diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -18980,6 +18980,41 @@
     return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
   }
 
+  unsigned NumEltsIn128 = 128 / EltSizeInBits;
+  assert(isPowerOf2_32(NumEltsIn128) &&
+         "Vectors will always have power-of-two number of elements.");
+
+  // If we have broadcast support for the given scalar type,
+  // and it is profitable to do so (either if we are inserting into the high
+  // part of a YMM register, or the scalar has other uses and all of them
+  // produce a vector), then prefer the broadcast+blend sequence.
+  if (((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
+       (Subtarget.hasAVX() && (EltSizeInBits == 32 || EltSizeInBits == 64) &&
+        ISD::isNormalLoad(N1.getNode()))) &&
+      (N1->hasOneUse() ? (VT.is256BitVector() && IdxVal >= NumEltsIn128)
+                       : all_of(N1->uses(), [](llvm::SDNode *U) {
+                           switch (U->getOpcode()) {
+                           case ISD::INSERT_VECTOR_ELT:
+                           case ISD::BUILD_VECTOR:
+                           case ISD::SCALAR_TO_VECTOR:
+                           case ISD::SPLAT_VECTOR:
+                           case X86ISD::VBROADCAST:
+                           case X86ISD::PINSRB:
+                           case X86ISD::PINSRW:
+                           case X86ISD::INSERTPS:
+                           case X86ISD::INSERTQI:
+                             return true;
+                           default:
+                             return false;
+                           }
+                         }))) {
+    SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
+    SmallVector<int> BlendMask;
+    for (unsigned i = 0; i != NumElts; ++i)
+      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
+    return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
+  }
+
   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
   // into that, and then insert the subvector back into the result.
   if (VT.is256BitVector() || VT.is512BitVector()) {
@@ -19001,8 +19036,6 @@
     SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
 
     // Insert the element into the desired chunk.
-    unsigned NumEltsIn128 = 128 / EltSizeInBits;
-    assert(isPowerOf2_32(NumEltsIn128));
     // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
     unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
 
@@ -37683,6 +37716,13 @@
     if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
       return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
 
+    // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
+    if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+        isNullConstant(Src.getOperand(1)) &&
+        DAG.getTargetLoweringInfo().isTypeLegal(
+            Src.getOperand(0).getValueType()))
+      return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
+
     // Share broadcast with the longest vector and extract low subvector (free).
     // Ensure the same SDValue from the SDNode use is being used.
     for (SDNode *User : Src->uses())
diff --git a/llvm/test/CodeGen/X86/avx-insertelt.ll b/llvm/test/CodeGen/X86/avx-insertelt.ll
--- a/llvm/test/CodeGen/X86/avx-insertelt.ll
+++ b/llvm/test/CodeGen/X86/avx-insertelt.ll
@@ -91,23 +91,35 @@
 ; 0'th element of high subvector insertion into an AVX register.
define <8 x float> @insert_f32_firstelt_of_high_subvector(<8 x float> %x, float %s) { -; ALL-LABEL: insert_f32_firstelt_of_high_subvector: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2 -; ALL-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX-LABEL: insert_f32_firstelt_of_high_subvector: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: insert_f32_firstelt_of_high_subvector: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss %xmm1, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-NEXT: retq %i0 = insertelement <8 x float> %x, float %s, i32 4 ret <8 x float> %i0 } define <4 x double> @insert_f64_firstelt_of_high_subvector(<4 x double> %x, double %s) { -; ALL-LABEL: insert_f64_firstelt_of_high_subvector: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2 -; ALL-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX-LABEL: insert_f64_firstelt_of_high_subvector: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: insert_f64_firstelt_of_high_subvector: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq %i0 = insertelement <4 x double> %x, double %s, i32 2 ret <4 x double> %i0 } @@ -140,9 +152,10 @@ ; ; AVX2-LABEL: insert_i16_firstelt_of_high_subvector: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpinsrw $0, %edi, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: retq %i0 = insertelement <16 x i16> %x, i16 %s, i32 8 ret <16 x i16> %i0 @@ -158,9 +171,9 @@ ; ; AVX2-LABEL: insert_i32_firstelt_of_high_subvector: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpinsrd $0, %edi, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-NEXT: retq %i0 = insertelement <8 x i32> %x, i32 %s, i32 4 ret <8 x i32> %i0 @@ -176,9 +189,9 @@ ; ; AVX2-LABEL: insert_i64_firstelt_of_high_subvector: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpinsrq $0, %rdi, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovq %rdi, %xmm1 +; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: retq %i0 = insertelement <4 x i64> %x, i64 %s, i32 2 ret <4 x i64> %i0 @@ -187,26 +200,38 @@ ; element insertion into 0'th element of both subvectors define <8 x float> @insert_f32_firstelts(<8 x float> %x, float %s) { -; ALL-LABEL: insert_f32_firstelts: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3] -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; ALL-NEXT: vinsertf128 
$1, %xmm0, %ymm2, %ymm0 -; ALL-NEXT: retq +; AVX-LABEL: insert_f32_firstelts: +; AVX: # %bb.0: +; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3] +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: insert_f32_firstelts: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss %xmm1, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-NEXT: retq %i0 = insertelement <8 x float> %x, float %s, i32 0 %i1 = insertelement <8 x float> %i0, float %s, i32 4 ret <8 x float> %i1 } define <4 x double> @insert_f64_firstelts(<4 x double> %x, double %s) { -; ALL-LABEL: insert_f64_firstelts: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; ALL-NEXT: retq +; AVX-LABEL: insert_f64_firstelts: +; AVX: # %bb.0: +; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: insert_f64_firstelts: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-NEXT: retq %i0 = insertelement <4 x double> %x, double %s, i32 0 %i1 = insertelement <4 x double> %i0, double %s, i32 2 ret <4 x double> %i1 @@ -244,10 +269,9 @@ ; ; AVX2-LABEL: insert_i16_firstelts: ; AVX2: # %bb.0: -; AVX2-NEXT: vpinsrw $0, %edi, %xmm0, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpinsrw $0, %edi, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-NEXT: retq %i0 = insertelement <16 x i16> %x, i16 %s, i32 0 %i1 = insertelement <16 x i16> %i0, i16 %s, i32 8 @@ -266,10 +290,8 @@ ; AVX2-LABEL: insert_i32_firstelts: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-NEXT: retq %i0 = insertelement <8 x i32> %x, i32 %s, i32 0 %i1 = insertelement <8 x i32> %i0, i32 %s, i32 4 @@ -287,10 +309,9 @@ ; ; AVX2-LABEL: insert_i64_firstelts: ; AVX2: # %bb.0: -; AVX2-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovq %rdi, %xmm1 +; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-NEXT: retq %i0 = insertelement <4 x i64> %x, i64 %s, i32 0 %i1 = insertelement <4 x i64> %i0, i64 %s, i32 2 @@ -300,23 +321,35 @@ ; element insertion into two elements of high subvector define <8 x float> @insert_f32_two_elts_of_high_subvector(<8 x float> %x, float %s) { -; ALL-LABEL: insert_f32_two_elts_of_high_subvector: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2 -; ALL-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,3] -; ALL-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX-LABEL: insert_f32_two_elts_of_high_subvector: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: insert_f32_two_elts_of_high_subvector: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss %xmm1, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq %i0 = insertelement <8 x float> %x, float %s, i32 4 %i1 = insertelement <8 x float> %i0, float %s, i32 5 ret <8 x float> %i1 } define <4 x double> @insert_f64_two_elts_of_high_subvector(<4 x double> %x, double %s) { -; ALL-LABEL: insert_f64_two_elts_of_high_subvector: -; ALL: # %bb.0: -; ALL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX-LABEL: insert_f64_two_elts_of_high_subvector: +; AVX: # %bb.0: +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: insert_f64_two_elts_of_high_subvector: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: retq %i0 = insertelement <4 x double> %x, double %s, i32 2 %i1 = insertelement <4 x double> %i0, double %s, i32 3 ret <4 x double> %i1 @@ -354,10 +387,9 @@ ; ; AVX2-LABEL: insert_i16_two_elts_of_high_subvector: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpinsrw $0, %edi, %xmm1, %xmm1 -; AVX2-NEXT: vpinsrw $1, %edi, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-NEXT: retq %i0 = insertelement <16 x i16> %x, i16 %s, i32 8 %i1 = insertelement <16 x i16> %i0, i16 %s, i32 9 @@ -375,10 +407,9 @@ ; ; AVX2-LABEL: insert_i32_two_elts_of_high_subvector: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpinsrd $0, %edi, %xmm1, %xmm1 -; AVX2-NEXT: vpinsrd $1, %edi, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: retq %i0 = insertelement <8 x i32> %x, i32 %s, i32 4 %i1 = insertelement <8 x i32> %i0, i32 %s, i32 5 @@ -395,9 +426,9 @@ ; ; AVX2-LABEL: insert_i64_two_elts_of_high_subvector: ; AVX2: # %bb.0: -; AVX2-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm1 -; AVX2-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovq %rdi, %xmm1 +; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: retq %i0 = insertelement <4 x i64> %x, i64 %s, i32 2 %i1 = insertelement <4 x i64> %i0, i64 %s, i32 3 @@ -407,11 +438,17 @@ ; element insertion into two elements of low subvector define <8 x float> @insert_f32_two_elts_of_low_subvector(<8 x float> %x, float %s) { -; ALL-LABEL: insert_f32_two_elts_of_low_subvector: -; ALL: # %bb.0: -; ALL-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; ALL-NEXT: retq +; AVX-LABEL: insert_f32_two_elts_of_low_subvector: +; AVX: # %bb.0: +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: 
retq +; +; AVX2-LABEL: insert_f32_two_elts_of_low_subvector: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: retq %i0 = insertelement <8 x float> %x, float %s, i32 0 %i1 = insertelement <8 x float> %i0, float %s, i32 1 ret <8 x float> %i1 @@ -457,8 +494,9 @@ ; ; AVX2-LABEL: insert_i16_two_elts_of_low_subvector: ; AVX2: # %bb.0: -; AVX2-NEXT: vpinsrw $0, %edi, %xmm0, %xmm1 -; AVX2-NEXT: vpinsrw $1, %edi, %xmm1, %xmm1 +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq %i0 = insertelement <16 x i16> %x, i16 %s, i32 0 @@ -477,9 +515,8 @@ ; AVX2-LABEL: insert_i32_two_elts_of_low_subvector: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] -; AVX2-NEXT: vpinsrd $1, %edi, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: retq %i0 = insertelement <8 x i32> %x, i32 %s, i32 0 %i1 = insertelement <8 x i32> %i0, i32 %s, i32 1 @@ -496,8 +533,8 @@ ; ; AVX2-LABEL: insert_i64_two_elts_of_low_subvector: ; AVX2: # %bb.0: -; AVX2-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm1 -; AVX2-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1 +; AVX2-NEXT: vmovq %rdi, %xmm1 +; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq %i0 = insertelement <4 x i64> %x, i64 %s, i32 0 diff --git a/llvm/test/CodeGen/X86/avx2-masked-gather.ll b/llvm/test/CodeGen/X86/avx2-masked-gather.ll --- a/llvm/test/CodeGen/X86/avx2-masked-gather.ll +++ b/llvm/test/CodeGen/X86/avx2-masked-gather.ll @@ -396,17 +396,15 @@ ; NOGATHER-NEXT: je .LBB6_10 ; NOGATHER-NEXT: # %bb.9: # %cond.load10 ; NOGATHER-NEXT: vmovq %xmm0, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 -; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2 -; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastss (%rcx), %ymm2 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] ; NOGATHER-NEXT: .LBB6_10: # %else11 ; NOGATHER-NEXT: testb $32, %al ; NOGATHER-NEXT: je .LBB6_12 ; NOGATHER-NEXT: # %bb.11: # %cond.load13 ; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 -; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm2, %xmm2 -; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastss (%rcx), %ymm2 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; NOGATHER-NEXT: .LBB6_12: # %else14 ; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0 ; NOGATHER-NEXT: testb $64, %al @@ -419,16 +417,14 @@ ; NOGATHER-NEXT: retq ; NOGATHER-NEXT: .LBB6_13: # %cond.load16 ; NOGATHER-NEXT: vmovq %xmm0, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 -; NOGATHER-NEXT: vpinsrd $2, (%rcx), %xmm2, %xmm2 -; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastss (%rcx), %ymm2 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; NOGATHER-NEXT: testb $-128, %al ; NOGATHER-NEXT: je .LBB6_16 ; NOGATHER-NEXT: .LBB6_15: # %cond.load19 ; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 -; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm0, %xmm0 -; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; 
NOGATHER-NEXT: vbroadcastss (%rax), %ymm0 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 ; NOGATHER-NEXT: retq entry: @@ -503,18 +499,15 @@ ; NOGATHER-NEXT: je .LBB7_10 ; NOGATHER-NEXT: # %bb.9: # %cond.load10 ; NOGATHER-NEXT: vmovq %xmm0, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 -; NOGATHER-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; NOGATHER-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] -; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastss (%rcx), %ymm2 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] ; NOGATHER-NEXT: .LBB7_10: # %else11 ; NOGATHER-NEXT: testb $32, %al ; NOGATHER-NEXT: je .LBB7_12 ; NOGATHER-NEXT: # %bb.11: # %cond.load13 ; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 -; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] -; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastss (%rcx), %ymm2 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; NOGATHER-NEXT: .LBB7_12: # %else14 ; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0 ; NOGATHER-NEXT: testb $64, %al @@ -527,16 +520,14 @@ ; NOGATHER-NEXT: retq ; NOGATHER-NEXT: .LBB7_13: # %cond.load16 ; NOGATHER-NEXT: vmovq %xmm0, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 -; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] -; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastss (%rcx), %ymm2 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; NOGATHER-NEXT: testb $-128, %al ; NOGATHER-NEXT: je .LBB7_16 ; NOGATHER-NEXT: .LBB7_15: # %cond.load19 ; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 -; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastss (%rax), %ymm0 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 ; NOGATHER-NEXT: retq entry: @@ -597,16 +588,14 @@ ; NOGATHER-NEXT: retq ; NOGATHER-NEXT: .LBB8_5: # %cond.load4 ; NOGATHER-NEXT: vmovq %xmm0, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 -; NOGATHER-NEXT: vpinsrq $0, (%rcx), %xmm2, %xmm2 -; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastsd (%rcx), %ymm2 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; NOGATHER-NEXT: testb $8, %al ; NOGATHER-NEXT: je .LBB8_8 ; NOGATHER-NEXT: .LBB8_7: # %cond.load7 ; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 -; NOGATHER-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0 -; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastsd (%rax), %ymm0 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 ; NOGATHER-NEXT: retq entry: @@ -667,16 +656,14 @@ ; NOGATHER-NEXT: retq ; NOGATHER-NEXT: .LBB9_5: # %cond.load4 ; NOGATHER-NEXT: vmovq %xmm0, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 -; NOGATHER-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] -; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastsd (%rcx), %ymm2 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; NOGATHER-NEXT: testb $8, %al ; NOGATHER-NEXT: je .LBB9_8 ; NOGATHER-NEXT: .LBB9_7: # %cond.load7 ; NOGATHER-NEXT: vpextrq 
$1, %xmm0, %rax -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 -; NOGATHER-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastsd (%rax), %ymm0 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 ; NOGATHER-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -550,13 +550,22 @@ } define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) { -; CHECK-LABEL: insert_v4i64: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v4i64: +; KNL: ## %bb.0: +; KNL-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; KNL-NEXT: vmovq %rdi, %xmm1 +; KNL-NEXT: vpbroadcastq %xmm1, %ymm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v4i64: +; SKX: ## %bb.0: +; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; SKX-NEXT: vpbroadcastq %rdi, %ymm1 +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; SKX-NEXT: retq %val = load i64, i64* %ptr %r1 = insertelement <4 x i64> %x, i64 %val, i32 1 %r2 = insertelement <4 x i64> %r1, i64 %y, i32 3 @@ -591,13 +600,22 @@ } define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) { -; CHECK-LABEL: insert_v8i32: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v8i32: +; KNL: ## %bb.0: +; KNL-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; KNL-NEXT: vmovd %edi, %xmm1 +; KNL-NEXT: vpbroadcastd %xmm1, %ymm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v8i32: +; SKX: ## %bb.0: +; SKX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; SKX-NEXT: vpbroadcastd %edi, %ymm1 +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; SKX-NEXT: retq %val = load i32, i32* %ptr %r1 = insertelement <8 x i32> %x, i32 %val, i32 1 %r2 = insertelement <8 x i32> %r1, i32 %y, i32 5 @@ -632,13 +650,24 @@ } define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) { -; CHECK-LABEL: insert_v16i16: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v16i16: +; KNL: ## %bb.0: +; KNL-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; KNL-NEXT: vmovd %edi, %xmm1 +; KNL-NEXT: vpbroadcastw %xmm1, %ymm1 +; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v16i16: +; SKX: ## %bb.0: 
+; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; SKX-NEXT: vpbroadcastw %edi, %ymm1 +; SKX-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; SKX-NEXT: retq %val = load i16, i16* %ptr %r1 = insertelement <16 x i16> %x, i16 %val, i32 1 %r2 = insertelement <16 x i16> %r1, i16 %y, i32 9 @@ -739,12 +768,20 @@ } define <16 x i16> @test_insert_128_v16i16(<16 x i16> %x, i16 %y) { -; CHECK-LABEL: test_insert_128_v16i16: -; CHECK: ## %bb.0: -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1 -; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; KNL-LABEL: test_insert_128_v16i16: +; KNL: ## %bb.0: +; KNL-NEXT: vmovd %edi, %xmm1 +; KNL-NEXT: vpbroadcastw %xmm1, %ymm1 +; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; KNL-NEXT: retq +; +; SKX-LABEL: test_insert_128_v16i16: +; SKX: ## %bb.0: +; SKX-NEXT: vpbroadcastw %edi, %ymm1 +; SKX-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; SKX-NEXT: retq %r = insertelement <16 x i16> %x, i16 %y, i32 10 ret <16 x i16> %r } diff --git a/llvm/test/CodeGen/X86/insertelement-shuffle.ll b/llvm/test/CodeGen/X86/insertelement-shuffle.ll --- a/llvm/test/CodeGen/X86/insertelement-shuffle.ll +++ b/llvm/test/CodeGen/X86/insertelement-shuffle.ll @@ -30,19 +30,18 @@ define <8 x i64> @insert_subvector_512(i32 %x0, i32 %x1, <8 x i64> %v) nounwind { ; X86_AVX256-LABEL: insert_subvector_512: ; X86_AVX256: # %bb.0: -; X86_AVX256-NEXT: vextracti128 $1, %ymm0, %xmm2 -; X86_AVX256-NEXT: vpinsrd $0, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; X86_AVX256-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; X86_AVX256-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; X86_AVX256-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm2 +; X86_AVX256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] +; X86_AVX256-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm2 +; X86_AVX256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; X86_AVX256-NEXT: retl ; ; X64_AVX256-LABEL: insert_subvector_512: ; X64_AVX256: # %bb.0: ; X64_AVX256-NEXT: vmovd %edi, %xmm2 ; X64_AVX256-NEXT: vpinsrd $1, %esi, %xmm2, %xmm2 -; X64_AVX256-NEXT: vextracti128 $1, %ymm0, %xmm3 -; X64_AVX256-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; X64_AVX256-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; X64_AVX256-NEXT: vpbroadcastq %xmm2, %ymm2 +; X64_AVX256-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; X64_AVX256-NEXT: retq ; ; X86_AVX512-LABEL: insert_subvector_512: diff --git a/llvm/test/CodeGen/X86/load-partial.ll b/llvm/test/CodeGen/X86/load-partial.ll --- a/llvm/test/CodeGen/X86/load-partial.ll +++ b/llvm/test/CodeGen/X86/load-partial.ll @@ -41,9 +41,8 @@ ; ; AVX-LABEL: load_float4_float3_0122: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovups (%rdi), %xmm1 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] +; AVX-NEXT: vbroadcastss 8(%rdi), %xmm0 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; AVX-NEXT: retq %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0 %p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1 @@ -90,9 
+89,8 @@ ; ; AVX-LABEL: load_float8_float3_0122: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovups (%rdi), %xmm1 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] +; AVX-NEXT: vbroadcastss 8(%rdi), %xmm0 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; AVX-NEXT: retq %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0 %p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1 @@ -139,9 +137,8 @@ ; ; AVX-LABEL: load_float4_float3_as_float2_float_0122: ; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; AVX-NEXT: vbroadcastss 8(%rdi), %xmm0 +; AVX-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; AVX-NEXT: retq %2 = bitcast <4 x float>* %0 to <2 x float>* %3 = load <2 x float>, <2 x float>* %2, align 4 @@ -194,9 +191,8 @@ ; ; AVX-LABEL: load_float4_float3_trunc_0122: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovaps (%rdi), %xmm1 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] +; AVX-NEXT: vbroadcastss 8(%rdi), %xmm0 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; AVX-NEXT: retq %2 = bitcast <4 x float>* %0 to i64* %3 = load i64, i64* %2, align 16 diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll --- a/llvm/test/CodeGen/X86/masked_expandload.ll +++ b/llvm/test/CodeGen/X86/masked_expandload.ll @@ -216,16 +216,14 @@ ; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je LBB1_6 ; AVX1-NEXT: LBB1_5: ## %cond.load5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovlpd (%rdi), %xmm1, %xmm1 ## xmm1 = mem[0],xmm1[1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm1 +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je LBB1_8 ; AVX1-NEXT: LBB1_7: ## %cond.load9 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovhps (%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0,1],mem[0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm1 +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: expandload_v4f64_v4i64: @@ -259,16 +257,14 @@ ; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je LBB1_6 ; AVX2-NEXT: LBB1_5: ## %cond.load5 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovlpd (%rdi), %xmm1, %xmm1 ## xmm1 = mem[0],xmm1[1] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd (%rdi), %ymm1 +; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je LBB1_8 ; AVX2-NEXT: LBB1_7: ## %cond.load9 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovhpd (%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd (%rdi), %ymm1 +; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX2-NEXT: retq ; ; AVX512F-LABEL: expandload_v4f64_v4i64: @@ -405,16 +401,14 @@ ; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je LBB2_6 ; AVX1-NEXT: LBB2_5: ## %cond.load5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je LBB2_8 ; AVX1-NEXT: LBB2_7: ## %cond.load9 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je LBB2_10 @@ -431,16 +425,14 @@ ; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je LBB2_14 ; AVX1-NEXT: LBB2_13: ## %cond.load21 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je LBB2_16 ; AVX1-NEXT: LBB2_15: ## %cond.load25 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: expandload_v8f64_v8i1: @@ -486,16 +478,14 @@ ; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je LBB2_6 ; AVX2-NEXT: LBB2_5: ## %cond.load5 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je LBB2_8 ; AVX2-NEXT: LBB2_7: ## %cond.load9 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je LBB2_10 @@ -512,16 +502,14 @@ ; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je LBB2_14 ; AVX2-NEXT: LBB2_13: ## %cond.load21 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je LBB2_16 ; AVX2-NEXT: LBB2_15: ## %cond.load25 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: retq ; ; AVX512F-LABEL: expandload_v8f64_v8i1: @@ -777,16 +765,14 @@ ; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je LBB3_6 ; AVX1-NEXT: LBB3_5: ## %cond.load5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je LBB3_8 ; AVX1-NEXT: LBB3_7: ## %cond.load9 -; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je LBB3_10 @@ -803,16 +789,14 @@ ; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je LBB3_14 ; AVX1-NEXT: LBB3_13: ## %cond.load21 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je LBB3_16 ; AVX1-NEXT: LBB3_15: ## %cond.load25 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testl $256, %eax ## imm = 0x100 ; AVX1-NEXT: je LBB3_18 @@ -829,16 +813,14 @@ ; AVX1-NEXT: testl $1024, %eax ## imm = 0x400 ; AVX1-NEXT: je LBB3_22 ; AVX1-NEXT: LBB3_21: ## %cond.load37 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testl $2048, %eax ## imm = 0x800 ; AVX1-NEXT: je LBB3_24 ; AVX1-NEXT: LBB3_23: ## %cond.load41 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX1-NEXT: je LBB3_26 @@ -855,16 +837,14 @@ ; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX1-NEXT: je LBB3_30 ; AVX1-NEXT: LBB3_29: ## %cond.load53 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX1-NEXT: je LBB3_32 ; AVX1-NEXT: LBB3_31: ## %cond.load57 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: expandload_v16f64_v16i32: @@ -939,16 +919,14 @@ ; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je LBB3_6 ; AVX2-NEXT: LBB3_5: ## %cond.load5 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je LBB3_8 ; AVX2-NEXT: LBB3_7: ## %cond.load9 -; AVX2-NEXT: 
vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je LBB3_10 @@ -965,16 +943,14 @@ ; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je LBB3_14 ; AVX2-NEXT: LBB3_13: ## %cond.load21 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je LBB3_16 ; AVX2-NEXT: LBB3_15: ## %cond.load25 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testl $256, %eax ## imm = 0x100 ; AVX2-NEXT: je LBB3_18 @@ -991,16 +967,14 @@ ; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 ; AVX2-NEXT: je LBB3_22 ; AVX2-NEXT: LBB3_21: ## %cond.load37 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 ; AVX2-NEXT: je LBB3_24 ; AVX2-NEXT: LBB3_23: ## %cond.load41 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX2-NEXT: je LBB3_26 @@ -1017,16 +991,14 @@ ; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX2-NEXT: je LBB3_30 ; AVX2-NEXT: LBB3_29: ## %cond.load53 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX2-NEXT: je LBB3_32 ; AVX2-NEXT: LBB3_31: ## %cond.load57 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: retq ; ; AVX512F-LABEL: expandload_v16f64_v16i32: @@ -2193,31 +2165,26 @@ ; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je LBB8_10 ; AVX1-NEXT: LBB8_9: ## %cond.load13 -; AVX1-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4],ymm0[5,6,7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je 
LBB8_12 ; AVX1-NEXT: LBB8_11: ## %cond.load17 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je LBB8_14 ; AVX1-NEXT: LBB8_13: ## %cond.load21 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6],ymm0[7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je LBB8_16 ; AVX1-NEXT: LBB8_15: ## %cond.load25 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $256, %eax ## imm = 0x100 ; AVX1-NEXT: je LBB8_18 @@ -2246,31 +2213,26 @@ ; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX1-NEXT: je LBB8_26 ; AVX1-NEXT: LBB8_25: ## %cond.load45 -; AVX1-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 ; AVX1-NEXT: je LBB8_28 ; AVX1-NEXT: LBB8_27: ## %cond.load49 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX1-NEXT: je LBB8_30 ; AVX1-NEXT: LBB8_29: ## %cond.load53 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX1-NEXT: je LBB8_32 ; AVX1-NEXT: LBB8_31: ## %cond.load57 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $65536, %eax ## imm = 0x10000 ; AVX1-NEXT: je LBB8_34 @@ -2299,31 +2261,26 @@ ; AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000 ; AVX1-NEXT: je LBB8_42 ; AVX1-NEXT: LBB8_41: ## %cond.load77 -; AVX1-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $2097152, %eax ## imm = 0x200000 ; AVX1-NEXT: je LBB8_44 ; AVX1-NEXT: LBB8_43: ## %cond.load81 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $4194304, %eax ## imm = 0x400000 ; AVX1-NEXT: je LBB8_46 ; AVX1-NEXT: LBB8_45: ## %cond.load85 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $8388608, %eax ## imm = 0x800000 ; AVX1-NEXT: je LBB8_48 ; AVX1-NEXT: LBB8_47: ## %cond.load89 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $16777216, %eax ## imm = 0x1000000 ; AVX1-NEXT: je LBB8_50 @@ -2352,31 +2309,26 @@ ; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000 ; AVX1-NEXT: je LBB8_58 ; AVX1-NEXT: LBB8_57: ## %cond.load109 -; AVX1-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $536870912, %eax ## imm = 0x20000000 ; AVX1-NEXT: je LBB8_60 ; AVX1-NEXT: LBB8_59: ## %cond.load113 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $1073741824, %eax ## imm = 0x40000000 ; AVX1-NEXT: je LBB8_62 ; AVX1-NEXT: LBB8_61: ## %cond.load117 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 ; AVX1-NEXT: je LBB8_64 ; AVX1-NEXT: LBB8_63: ## %cond.load121 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: expandload_v32f32_v32i32: @@ -2515,31 +2467,26 @@ ; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je LBB8_10 ; AVX2-NEXT: LBB8_9: ## %cond.load13 -; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = 
xmm4[0],xmm5[1,2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4],ymm0[5,6,7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je LBB8_12 ; AVX2-NEXT: LBB8_11: ## %cond.load17 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je LBB8_14 ; AVX2-NEXT: LBB8_13: ## %cond.load21 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6],ymm0[7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je LBB8_16 ; AVX2-NEXT: LBB8_15: ## %cond.load25 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $256, %eax ## imm = 0x100 ; AVX2-NEXT: je LBB8_18 @@ -2568,31 +2515,26 @@ ; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX2-NEXT: je LBB8_26 ; AVX2-NEXT: LBB8_25: ## %cond.load45 -; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 ; AVX2-NEXT: je LBB8_28 ; AVX2-NEXT: LBB8_27: ## %cond.load49 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX2-NEXT: je LBB8_30 ; AVX2-NEXT: LBB8_29: ## %cond.load53 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX2-NEXT: je LBB8_32 ; AVX2-NEXT: LBB8_31: ## %cond.load57 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000 ; AVX2-NEXT: je LBB8_34 @@ -2621,31 +2563,26 @@ ; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000 ; AVX2-NEXT: je LBB8_42 ; AVX2-NEXT: LBB8_41: ## %cond.load77 -; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; 
AVX2-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000 ; AVX2-NEXT: je LBB8_44 ; AVX2-NEXT: LBB8_43: ## %cond.load81 -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000 ; AVX2-NEXT: je LBB8_46 ; AVX2-NEXT: LBB8_45: ## %cond.load85 -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000 ; AVX2-NEXT: je LBB8_48 ; AVX2-NEXT: LBB8_47: ## %cond.load89 -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000 ; AVX2-NEXT: je LBB8_50 @@ -2674,31 +2611,26 @@ ; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000 ; AVX2-NEXT: je LBB8_58 ; AVX2-NEXT: LBB8_57: ## %cond.load109 -; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000 ; AVX2-NEXT: je LBB8_60 ; AVX2-NEXT: LBB8_59: ## %cond.load113 -; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 ; AVX2-NEXT: je LBB8_62 ; AVX2-NEXT: LBB8_61: ## %cond.load117 -; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 ; AVX2-NEXT: je LBB8_64 ; AVX2-NEXT: LBB8_63: ## %cond.load121 -; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-NEXT: retq ; ; AVX512-LABEL: expandload_v32f32_v32i32: diff --git 
a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll --- a/llvm/test/CodeGen/X86/masked_gather.ll +++ b/llvm/test/CodeGen/X86/masked_gather.ll @@ -1359,11 +1359,10 @@ ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB4_16 ; AVX1-NEXT: .LBB4_15: # %cond.load19 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpinsrd $3, c+12(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX1-NEXT: .LBB4_16: # %else20 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 @@ -1393,11 +1392,10 @@ ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB4_32 ; AVX1-NEXT: .LBB4_31: # %cond.load58 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpinsrd $3, c+28(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX1-NEXT: .LBB4_32: # %else61 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -1418,9 +1416,8 @@ ; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB4_42 ; AVX1-NEXT: .LBB4_41: # %cond.load84 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpinsrd $0, c+28(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] ; AVX1-NEXT: .LBB4_42: # %else87 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 @@ -1428,25 +1425,22 @@ ; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB4_44 ; AVX1-NEXT: # %bb.43: # %cond.load89 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpinsrd $1, c+28(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX1-NEXT: .LBB4_44: # %else92 ; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB4_46 ; AVX1-NEXT: # %bb.45: # %cond.load94 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpinsrd $2, c+28(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6],ymm0[7] ; AVX1-NEXT: .LBB4_46: # %else97 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB4_48 ; AVX1-NEXT: # %bb.47: # %cond.load99 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpinsrd $3, c+28(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX1-NEXT: .LBB4_48: # %else102 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 @@ -1474,21 +1468,18 @@ ; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB4_10 ; AVX1-NEXT: .LBB4_9: # %cond.load10 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpinsrd $0, c+12(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastss 
c+12(%rip), %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4],ymm1[5,6,7] ; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB4_12 ; AVX1-NEXT: .LBB4_11: # %cond.load13 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpinsrd $1, c+12(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] ; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB4_14 ; AVX1-NEXT: .LBB4_13: # %cond.load16 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpinsrd $2, c+12(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6],ymm1[7] ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: jne .LBB4_15 ; AVX1-NEXT: jmp .LBB4_16 @@ -1512,21 +1503,18 @@ ; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB4_26 ; AVX1-NEXT: .LBB4_25: # %cond.load43 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpinsrd $0, c+28(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7] ; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB4_28 ; AVX1-NEXT: .LBB4_27: # %cond.load48 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpinsrd $1, c+28(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] ; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB4_30 ; AVX1-NEXT: .LBB4_29: # %cond.load53 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpinsrd $2, c+28(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: jne .LBB4_31 ; AVX1-NEXT: jmp .LBB4_32 @@ -1581,9 +1569,8 @@ ; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB4_16 ; AVX2-NEXT: .LBB4_15: # %cond.load19 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrd $3, c+12(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastd c+12(%rip), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: .LBB4_16: # %else20 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2 @@ -1613,9 +1600,8 @@ ; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB4_32 ; AVX2-NEXT: .LBB4_31: # %cond.load58 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpinsrd $3, c+28(%rip), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-NEXT: .LBB4_32: # %else61 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm0, %ymm0 @@ -1642,17 +1628,15 @@ ; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB4_46 ; AVX2-NEXT: .LBB4_45: # %cond.load94 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpinsrd $2, c+28(%rip), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6],ymm0[7] ; AVX2-NEXT: .LBB4_46: # %else97 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB4_48 ; AVX2-NEXT: # %bb.47: # 
%cond.load99 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpinsrd $3, c+28(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: .LBB4_48: # %else102 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -1676,21 +1660,18 @@ ; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB4_10 ; AVX2-NEXT: .LBB4_9: # %cond.load10 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrd $0, c+12(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastd c+12(%rip), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] ; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB4_12 ; AVX2-NEXT: .LBB4_11: # %cond.load13 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrd $1, c+12(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastd c+12(%rip), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB4_14 ; AVX2-NEXT: .LBB4_13: # %cond.load16 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrd $2, c+12(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastd c+12(%rip), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: jne .LBB4_15 ; AVX2-NEXT: jmp .LBB4_16 @@ -1714,21 +1695,18 @@ ; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB4_26 ; AVX2-NEXT: .LBB4_25: # %cond.load43 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpinsrd $0, c+28(%rip), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] ; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB4_28 ; AVX2-NEXT: .LBB4_27: # %cond.load48 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpinsrd $1, c+28(%rip), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB4_30 ; AVX2-NEXT: .LBB4_29: # %cond.load53 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpinsrd $2, c+28(%rip), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] ; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: jne .LBB4_31 ; AVX2-NEXT: jmp .LBB4_32 @@ -1752,15 +1730,13 @@ ; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB4_42 ; AVX2-NEXT: .LBB4_41: # %cond.load84 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpinsrd $0, c+28(%rip), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4],ymm0[5,6,7] ; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB4_44 ; AVX2-NEXT: .LBB4_43: # %cond.load89 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpinsrd $1, c+28(%rip), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: jne .LBB4_45 ; AVX2-NEXT: jmp .LBB4_46 diff --git 
a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -965,16 +965,14 @@ ; KNL_64-NEXT: retq ; KNL_64-NEXT: .LBB15_5: # %cond.load4 ; KNL_64-NEXT: vmovq %xmm0, %rcx -; KNL_64-NEXT: vextracti128 $1, %ymm2, %xmm1 -; KNL_64-NEXT: vmovlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; KNL_64-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 +; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm1 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; KNL_64-NEXT: testb $8, %al ; KNL_64-NEXT: je .LBB15_8 ; KNL_64-NEXT: .LBB15_7: # %cond.load7 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax -; KNL_64-NEXT: vextracti128 $1, %ymm2, %xmm0 -; KNL_64-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; KNL_64-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2 +; KNL_64-NEXT: vpbroadcastq (%rax), %ymm0 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; KNL_64-NEXT: vmovdqa %ymm2, %ymm0 ; KNL_64-NEXT: retq ; @@ -1014,16 +1012,14 @@ ; KNL_32-NEXT: je .LBB15_6 ; KNL_32-NEXT: .LBB15_5: # %cond.load4 ; KNL_32-NEXT: vpextrd $2, %xmm0, %ecx -; KNL_32-NEXT: vextracti128 $1, %ymm2, %xmm1 -; KNL_32-NEXT: vmovlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; KNL_32-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 +; KNL_32-NEXT: vpbroadcastq (%ecx), %ymm1 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; KNL_32-NEXT: testb $8, %al ; KNL_32-NEXT: je .LBB15_8 ; KNL_32-NEXT: .LBB15_7: # %cond.load7 ; KNL_32-NEXT: vpextrd $3, %xmm0, %eax -; KNL_32-NEXT: vextracti128 $1, %ymm2, %xmm0 -; KNL_32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; KNL_32-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2 +; KNL_32-NEXT: vpbroadcastq (%eax), %ymm0 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; KNL_32-NEXT: vmovdqa %ymm2, %ymm0 ; KNL_32-NEXT: retl ; @@ -3220,17 +3216,15 @@ ; KNL_64-NEXT: je .LBB42_6 ; KNL_64-NEXT: # %bb.5: # %cond.load4 ; KNL_64-NEXT: vmovq %xmm2, %rcx -; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm3 -; KNL_64-NEXT: vpinsrq $0, (%rcx), %xmm3, %xmm3 -; KNL_64-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm3 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] ; KNL_64-NEXT: .LBB42_6: # %else5 ; KNL_64-NEXT: testb $8, %al ; KNL_64-NEXT: je .LBB42_8 ; KNL_64-NEXT: # %bb.7: # %cond.load7 ; KNL_64-NEXT: vpextrq $1, %xmm2, %rax -; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm3 -; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm3, %xmm3 -; KNL_64-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; KNL_64-NEXT: vpbroadcastq (%rax), %ymm3 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; KNL_64-NEXT: .LBB42_8: # %else8 ; KNL_64-NEXT: kmovw %k0, %eax ; KNL_64-NEXT: testb $1, %al @@ -3247,9 +3241,8 @@ ; KNL_64-NEXT: je .LBB42_16 ; KNL_64-NEXT: .LBB42_15: # %cond.load29 ; KNL_64-NEXT: vpextrq $1, %xmm2, %rax -; KNL_64-NEXT: vextracti128 $1, %ymm3, %xmm4 -; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm4, %xmm4 -; KNL_64-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; KNL_64-NEXT: vpbroadcastq (%rax), %ymm4 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; KNL_64-NEXT: .LBB42_16: # %else33 ; KNL_64-NEXT: kmovw %k0, %eax ; KNL_64-NEXT: testb $1, %al @@ -3266,9 +3259,8 @@ ; KNL_64-NEXT: je .LBB42_24 ; KNL_64-NEXT: .LBB42_23: # %cond.load54 ; KNL_64-NEXT: vpextrq $1, %xmm2, %rax -; KNL_64-NEXT: vextracti128 $1, %ymm4, %xmm0 -; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0 -; KNL_64-NEXT: 
vinserti128 $1, %xmm0, %ymm4, %ymm4 +; KNL_64-NEXT: vpbroadcastq (%rax), %ymm0 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm0[6,7] ; KNL_64-NEXT: .LBB42_24: # %else58 ; KNL_64-NEXT: vpaddq %ymm3, %ymm1, %ymm0 ; KNL_64-NEXT: vpaddq %ymm4, %ymm0, %ymm0 @@ -3286,9 +3278,8 @@ ; KNL_64-NEXT: je .LBB42_14 ; KNL_64-NEXT: .LBB42_13: # %cond.load23 ; KNL_64-NEXT: vmovq %xmm2, %rcx -; KNL_64-NEXT: vextracti128 $1, %ymm3, %xmm4 -; KNL_64-NEXT: vpinsrq $0, (%rcx), %xmm4, %xmm4 -; KNL_64-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm4 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; KNL_64-NEXT: testb $8, %al ; KNL_64-NEXT: jne .LBB42_15 ; KNL_64-NEXT: jmp .LBB42_16 @@ -3305,9 +3296,8 @@ ; KNL_64-NEXT: je .LBB42_22 ; KNL_64-NEXT: .LBB42_21: # %cond.load48 ; KNL_64-NEXT: vmovq %xmm2, %rcx -; KNL_64-NEXT: vextracti128 $1, %ymm4, %xmm0 -; KNL_64-NEXT: vpinsrq $0, (%rcx), %xmm0, %xmm0 -; KNL_64-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm4 +; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm0 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7] ; KNL_64-NEXT: testb $8, %al ; KNL_64-NEXT: jne .LBB42_23 ; KNL_64-NEXT: jmp .LBB42_24 @@ -3347,19 +3337,19 @@ ; KNL_32-NEXT: vpextrd $2, %xmm0, %edx ; KNL_32-NEXT: je .LBB42_6 ; KNL_32-NEXT: # %bb.5: # %cond.load4 -; KNL_32-NEXT: vextracti128 $1, %ymm1, %xmm2 -; KNL_32-NEXT: vpinsrd $0, (%edx), %xmm2, %xmm2 -; KNL_32-NEXT: vpinsrd $1, 4(%edx), %xmm2, %xmm2 -; KNL_32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL_32-NEXT: vpbroadcastd (%edx), %ymm2 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] +; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm2 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; KNL_32-NEXT: .LBB42_6: # %else5 ; KNL_32-NEXT: testb $8, %bl ; KNL_32-NEXT: vpextrd $3, %xmm0, %esi ; KNL_32-NEXT: je .LBB42_8 ; KNL_32-NEXT: # %bb.7: # %cond.load7 -; KNL_32-NEXT: vextracti128 $1, %ymm1, %xmm0 -; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm0, %xmm0 -; KNL_32-NEXT: vpinsrd $3, 4(%esi), %xmm0, %xmm0 -; KNL_32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; KNL_32-NEXT: vpbroadcastd (%esi), %ymm0 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; KNL_32-NEXT: vpbroadcastd 4(%esi), %ymm1 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; KNL_32-NEXT: .LBB42_8: # %else8 ; KNL_32-NEXT: kmovw %k0, %ebx ; KNL_32-NEXT: testb $1, %bl @@ -3375,10 +3365,10 @@ ; KNL_32-NEXT: testb $8, %bl ; KNL_32-NEXT: je .LBB42_16 ; KNL_32-NEXT: .LBB42_15: # %cond.load29 -; KNL_32-NEXT: vextracti128 $1, %ymm0, %xmm2 -; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm2, %xmm2 -; KNL_32-NEXT: vpinsrd $3, 4(%esi), %xmm2, %xmm2 -; KNL_32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; KNL_32-NEXT: vpbroadcastd (%esi), %ymm2 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7] +; KNL_32-NEXT: vpbroadcastd 4(%esi), %ymm2 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; KNL_32-NEXT: .LBB42_16: # %else33 ; KNL_32-NEXT: kmovw %k0, %ebx ; KNL_32-NEXT: testb $1, %bl @@ -3394,10 +3384,10 @@ ; KNL_32-NEXT: testb $8, %bl ; KNL_32-NEXT: je .LBB42_24 ; KNL_32-NEXT: .LBB42_23: # %cond.load54 -; KNL_32-NEXT: vextracti128 $1, %ymm2, %xmm3 -; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm3, %xmm3 -; KNL_32-NEXT: vpinsrd $3, 4(%esi), %xmm3, %xmm3 -; KNL_32-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; KNL_32-NEXT: vpbroadcastd (%esi), %ymm3 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] 
+; KNL_32-NEXT: vpbroadcastd 4(%esi), %ymm3 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; KNL_32-NEXT: .LBB42_24: # %else58 ; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; KNL_32-NEXT: vpaddq %ymm2, %ymm0, %ymm0 @@ -3419,10 +3409,10 @@ ; KNL_32-NEXT: testb $4, %bl ; KNL_32-NEXT: je .LBB42_14 ; KNL_32-NEXT: .LBB42_13: # %cond.load23 -; KNL_32-NEXT: vextracti128 $1, %ymm0, %xmm2 -; KNL_32-NEXT: vpinsrd $0, (%edx), %xmm2, %xmm2 -; KNL_32-NEXT: vpinsrd $1, 4(%edx), %xmm2, %xmm2 -; KNL_32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; KNL_32-NEXT: vpbroadcastd (%edx), %ymm2 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] +; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm2 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; KNL_32-NEXT: testb $8, %bl ; KNL_32-NEXT: jne .LBB42_15 ; KNL_32-NEXT: jmp .LBB42_16 @@ -3437,10 +3427,10 @@ ; KNL_32-NEXT: testb $4, %bl ; KNL_32-NEXT: je .LBB42_22 ; KNL_32-NEXT: .LBB42_21: # %cond.load48 -; KNL_32-NEXT: vextracti128 $1, %ymm2, %xmm3 -; KNL_32-NEXT: vpinsrd $0, (%edx), %xmm3, %xmm3 -; KNL_32-NEXT: vpinsrd $1, 4(%edx), %xmm3, %xmm3 -; KNL_32-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; KNL_32-NEXT: vpbroadcastd (%edx), %ymm3 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] +; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm3 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; KNL_32-NEXT: testb $8, %bl ; KNL_32-NEXT: jne .LBB42_23 ; KNL_32-NEXT: jmp .LBB42_24 diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -3463,51 +3463,51 @@ ; AVX2-NEXT: testl $256, %eax ## imm = 0x100 ; AVX2-NEXT: je LBB22_18 ; AVX2-NEXT: LBB22_17: ## %cond.load22 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw 16(%rdi), %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: testl $512, %eax ## imm = 0x200 ; AVX2-NEXT: je LBB22_20 ; AVX2-NEXT: LBB22_19: ## %cond.load25 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw 18(%rdi), %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 ; AVX2-NEXT: je LBB22_22 ; AVX2-NEXT: LBB22_21: ## %cond.load28 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw 20(%rdi), %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 ; AVX2-NEXT: je LBB22_24 ; AVX2-NEXT: LBB22_23: ## %cond.load31 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw 22(%rdi), %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX2-NEXT: je LBB22_26 ; AVX2-NEXT: LBB22_25: ## %cond.load34 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw 24(%rdi), %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 ; AVX2-NEXT: je LBB22_28 ; AVX2-NEXT: LBB22_27: ## %cond.load37 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw 26(%rdi), %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX2-NEXT: je LBB22_30 ; AVX2-NEXT: LBB22_29: ## %cond.load40 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw 28(%rdi), %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX2-NEXT: je LBB22_32 ; AVX2-NEXT: LBB22_31: ## %cond.load43 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw 30(%rdi), %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -3609,51 +3609,51 @@ ; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 ; AVX512F-NEXT: je LBB22_18 ; AVX512F-NEXT: LBB22_17: ## %cond.load22 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 16(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 ; AVX512F-NEXT: je LBB22_20 ; AVX512F-NEXT: LBB22_19: ## %cond.load25 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 18(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 ; AVX512F-NEXT: je LBB22_22 ; AVX512F-NEXT: LBB22_21: ## %cond.load28 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 20(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800 ; AVX512F-NEXT: je LBB22_24 ; 
AVX512F-NEXT: LBB22_23: ## %cond.load31 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 22(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX512F-NEXT: je LBB22_26 ; AVX512F-NEXT: LBB22_25: ## %cond.load34 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 24(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 ; AVX512F-NEXT: je LBB22_28 ; AVX512F-NEXT: LBB22_27: ## %cond.load37 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 26(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX512F-NEXT: je LBB22_30 ; AVX512F-NEXT: LBB22_29: ## %cond.load40 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 28(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX512F-NEXT: je LBB22_32 ; AVX512F-NEXT: LBB22_31: ## %cond.load43 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 30(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512F-NEXT: retq ; @@ -3755,51 +3755,51 @@ ; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100 ; AVX512VLDQ-NEXT: je LBB22_18 ; AVX512VLDQ-NEXT: LBB22_17: ## %cond.load22 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 16(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200 ; AVX512VLDQ-NEXT: je LBB22_20 ; AVX512VLDQ-NEXT: LBB22_19: ## %cond.load25 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 18(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl 
$1024, %eax ## imm = 0x400 ; AVX512VLDQ-NEXT: je LBB22_22 ; AVX512VLDQ-NEXT: LBB22_21: ## %cond.load28 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 20(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800 ; AVX512VLDQ-NEXT: je LBB22_24 ; AVX512VLDQ-NEXT: LBB22_23: ## %cond.load31 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 22(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX512VLDQ-NEXT: je LBB22_26 ; AVX512VLDQ-NEXT: LBB22_25: ## %cond.load34 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 24(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000 ; AVX512VLDQ-NEXT: je LBB22_28 ; AVX512VLDQ-NEXT: LBB22_27: ## %cond.load37 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 26(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX512VLDQ-NEXT: je LBB22_30 ; AVX512VLDQ-NEXT: LBB22_29: ## %cond.load40 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 28(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX512VLDQ-NEXT: je LBB22_32 ; AVX512VLDQ-NEXT: LBB22_31: ## %cond.load43 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 30(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLDQ-NEXT: retq ; @@ -7084,33 +7084,17 @@ ; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm1 ; SSE42-NEXT: retq ; -; AVX1-LABEL: load_one_mask_bit_set3: -; AVX1: ## %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_one_mask_bit_set3: -; AVX2: ## %bb.0: -; 
AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_one_mask_bit_set3: -; AVX512: ## %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: load_one_mask_bit_set3: +; AVX: ## %bb.0: +; AVX-NEXT: vbroadcastsd 16(%rdi), %ymm1 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX-NEXT: retq ; ; X86-AVX512-LABEL: load_one_mask_bit_set3: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX512-NEXT: vmovlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX512-NEXT: vbroadcastsd 16(%eax), %ymm1 +; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; X86-AVX512-NEXT: retl %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> , <4 x i64> %val) ret <4 x i64> %res @@ -7126,17 +7110,15 @@ ; ; AVX-LABEL: load_one_mask_bit_set4: ; AVX: ## %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vbroadcastsd 24(%rdi), %ymm1 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: retq ; ; X86-AVX512-LABEL: load_one_mask_bit_set4: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX512-NEXT: vbroadcastsd 24(%eax), %ymm1 +; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; X86-AVX512-NEXT: retl %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> , <4 x double> %val) ret <4 x double> %res @@ -7152,9 +7134,8 @@ ; ; AVX1OR2-LABEL: load_one_mask_bit_set5: ; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1OR2-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] -; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1OR2-NEXT: vbroadcastsd 56(%rdi), %ymm2 +; AVX1OR2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: load_one_mask_bit_set5: diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -271,6 +271,7 @@ ; AVX2-NEXT: vmovdqu c+128(%rip), %ymm0 ; AVX2-NEXT: addl c+128(%rip), %eax ; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7] @@ -295,15 +296,16 @@ ; AVX512-NEXT: vmovdqu64 c+128(%rip), %zmm1 ; AVX512-NEXT: addl c+128(%rip), %eax ; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm2 +; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm3 ; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7] -; AVX512-NEXT: vmovdqa c+128(%rip), %xmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7] ; AVX512-NEXT: vmovdqu %ymm0, c+128(%rip) ; AVX512-NEXT: vmovdqu c+160(%rip), 
%ymm0 ; AVX512-NEXT: vmovdqu64 d+128(%rip), %zmm3 -; AVX512-NEXT: vpinsrd $0, %eax, %xmm2, %xmm2 -; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; AVX512-NEXT: movw $1, %ax +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 ; AVX512-NEXT: vpsubd %zmm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, d+128(%rip) diff --git a/llvm/test/CodeGen/X86/pr29112.ll b/llvm/test/CodeGen/X86/pr29112.ll --- a/llvm/test/CodeGen/X86/pr29112.ll +++ b/llvm/test/CodeGen/X86/pr29112.ll @@ -11,45 +11,46 @@ ; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 80 ; CHECK-NEXT: vmovaps %xmm1, %xmm9 -; CHECK-NEXT: vmovaps {{.*#+}} xmm14 = [4,22,1,17] -; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm14 -; CHECK-NEXT: vmovaps {{.*#+}} xmm10 = [4,30,1,22] +; CHECK-NEXT: vmovaps {{.*#+}} xmm10 = [4,22,1,3] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm10 -; CHECK-NEXT: vmovaps {{.*#+}} xmm8 = [4,28,1,29] +; CHECK-NEXT: vmovaps {{.*#+}} xmm12 = [4,30,1,3] +; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm12 +; CHECK-NEXT: vmovaps {{.*#+}} xmm8 = [4,28,1,3] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm8 -; CHECK-NEXT: vmovaps {{.*#+}} xmm7 = <5,20,u,u> -; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm7 -; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [4,21,1,7] +; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [4,21,1,3] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm4 -; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm5 -; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm6 -; CHECK-NEXT: vunpcklps {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm11[0,1],xmm2[1],xmm11[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm13 = xmm1[0,1,2],xmm3[1] +; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm15 +; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm7 +; CHECK-NEXT: vunpcklps {{.*#+}} xmm11 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm11[0,1],xmm2[1],xmm11[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm0[0,1,2],xmm3[1] ; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1] ; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4 -; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[3,3,3,3] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1,2],xmm4[3] +; CHECK-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10,11] ; CHECK-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3] -; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm3[3] -; CHECK-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm3[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[1] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1] -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm8 +; CHECK-NEXT: vbroadcastss %xmm15, %xmm1 +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[1],xmm1[1],zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1],xmm1[3] +; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm3[3] +; CHECK-NEXT: vblendps {{.*#+}} xmm13 = xmm0[0,1,2],xmm3[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm8[0,1,2],xmm3[1] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[1] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm8 +; CHECK-NEXT: vinsertps {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[2] +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm10[0,1,2],xmm3[1] ; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2] -; CHECK-NEXT: vaddps %xmm2, %xmm14, %xmm2 -; CHECK-NEXT: vmovaps %xmm13, %xmm1 -; CHECK-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vaddps %xmm10, %xmm13, %xmm10 -; CHECK-NEXT: vaddps %xmm13, %xmm13, %xmm3 -; CHECK-NEXT: vaddps %xmm12, %xmm14, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm8, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm2 +; CHECK-NEXT: vmovaps %xmm14, %xmm1 +; CHECK-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vaddps %xmm12, %xmm14, %xmm10 +; CHECK-NEXT: vaddps %xmm14, %xmm14, %xmm3 ; CHECK-NEXT: vaddps %xmm0, %xmm13, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm8, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm14, %xmm0 ; CHECK-NEXT: vmovaps %xmm3, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vmovaps %xmm10, (%rsp) ; CHECK-NEXT: vmovaps %xmm9, %xmm3 diff --git a/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll b/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll --- a/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll +++ b/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 ; 0'th element insertion into an SSE register. 
@@ -262,8 +262,8 @@ ; ; AVX-LABEL: insert_f32_two_elts: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[2,3] +; AVX-NEXT: vbroadcastss (%rdi), %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX-NEXT: retq %s = load float, float* %s.addr %i0 = insertelement <4 x float> %x, float %s, i32 0 @@ -336,12 +336,18 @@ ; SSE-NEXT: pinsrw $1, %eax, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: insert_i16_two_elts: -; AVX: # %bb.0: -; AVX-NEXT: movzwl (%rdi), %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: insert_i16_two_elts: +; AVX1: # %bb.0: +; AVX1-NEXT: movzwl (%rdi), %eax +; AVX1-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_i16_two_elts: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX2-NEXT: retq %s = load i16, i16* %s.addr %i0 = insertelement <8 x i16> %x, i16 %s, i32 0 %i1 = insertelement <8 x i16> %i0, i16 %s, i32 1 @@ -368,9 +374,8 @@ ; ; AVX-LABEL: insert_i32_two_elts: ; AVX: # %bb.0: -; AVX-NEXT: movl (%rdi), %eax -; AVX-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vbroadcastss (%rdi), %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX-NEXT: retq %s = load i32, i32* %s.addr %i0 = insertelement <4 x i32> %x, i32 %s, i32 0 @@ -396,9 +401,7 @@ ; ; AVX-LABEL: insert_i64_two_elts: ; AVX: # %bb.0: -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: vpinsrq $0, %rax, %xmm0, %xmm0 -; AVX-NEXT: vpinsrq $1, %rax, %xmm0, %xmm0 +; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX-NEXT: retq %s = load i64, i64* %s.addr %i0 = insertelement <2 x i64> %x, i64 %s, i32 0 diff --git a/llvm/test/CodeGen/X86/sse-insertelt.ll b/llvm/test/CodeGen/X86/sse-insertelt.ll --- a/llvm/test/CodeGen/X86/sse-insertelt.ll +++ b/llvm/test/CodeGen/X86/sse-insertelt.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 ; 0'th element insertion into an SSE register. 
@@ -247,10 +247,16 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: insert_f32_two_elts: -; AVX: # %bb.0: -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: insert_f32_two_elts: +; AVX1: # %bb.0: +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_f32_two_elts: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq %i0 = insertelement <4 x float> %x, float %s, i32 0 %i1 = insertelement <4 x float> %i0, float %s, i32 1 ret <4 x float> %i1 @@ -315,11 +321,18 @@ ; SSE-NEXT: pinsrw $1, %edi, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: insert_i16_two_elts: -; AVX: # %bb.0: -; AVX-NEXT: vpinsrw $0, %edi, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: insert_i16_two_elts: +; AVX1: # %bb.0: +; AVX1-NEXT: vpinsrw $0, %edi, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_i16_two_elts: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX2-NEXT: retq %i0 = insertelement <8 x i16> %x, i16 %s, i32 0 %i1 = insertelement <8 x i16> %i0, i16 %s, i32 1 ret <8 x i16> %i1 @@ -341,11 +354,18 @@ ; SSE41-NEXT: pinsrd $1, %edi, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_i32_two_elts: -; AVX: # %bb.0: -; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0 -; AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: insert_i32_two_elts: +; AVX1: # %bb.0: +; AVX1-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_i32_two_elts: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq %i0 = insertelement <4 x i32> %x, i32 %s, i32 0 %i1 = insertelement <4 x i32> %i0, i32 %s, i32 1 ret <4 x i32> %i1 @@ -365,11 +385,17 @@ ; SSE41-NEXT: pinsrq $1, %rdi, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_i64_two_elts: -; AVX: # %bb.0: -; AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 -; AVX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: insert_i64_two_elts: +; AVX1: # %bb.0: +; AVX1-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_i64_two_elts: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %rdi, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq %i0 = insertelement <2 x i64> %x, i64 %s, i32 0 %i1 = insertelement <2 x i64> %i0, i64 %s, i32 1 ret <2 x i64> %i1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -534,8 +534,8 @@ define void @test_demandedelts_pshufb_v32i8_v16i8(<2 x i32>* %src, <8 x i32>* %dst) { ; SKX64-LABEL: test_demandedelts_pshufb_v32i8_v16i8: ; SKX64: # %bb.0: -; SKX64-NEXT: vmovdqa 32(%rdi), %xmm0 -; SKX64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; SKX64-NEXT: vpbroadcastd 44(%rdi), %ymm0 +; SKX64-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; SKX64-NEXT: vmovdqa %ymm0, 672(%rsi) ; SKX64-NEXT: vmovdqa 208(%rdi), %xmm0 ; SKX64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero @@ 
-545,8 +545,8 @@ ; ; KNL64-LABEL: test_demandedelts_pshufb_v32i8_v16i8: ; KNL64: # %bb.0: -; KNL64-NEXT: vmovdqa 32(%rdi), %xmm0 -; KNL64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; KNL64-NEXT: vpbroadcastd 44(%rdi), %ymm0 +; KNL64-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; KNL64-NEXT: vmovdqa %ymm0, 672(%rsi) ; KNL64-NEXT: vmovdqa 208(%rdi), %xmm0 ; KNL64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero @@ -557,8 +557,8 @@ ; SKX32: # %bb.0: ; SKX32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; SKX32-NEXT: vmovdqa 32(%ecx), %xmm0 -; SKX32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; SKX32-NEXT: vpbroadcastd 44(%ecx), %ymm0 +; SKX32-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; SKX32-NEXT: vmovdqa %ymm0, 672(%eax) ; SKX32-NEXT: vmovdqa 208(%ecx), %xmm0 ; SKX32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero @@ -569,13 +569,13 @@ ; KNL32-LABEL: test_demandedelts_pshufb_v32i8_v16i8: ; KNL32: # %bb.0: ; KNL32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL32-NEXT: vmovdqa 32(%eax), %xmm0 -; KNL32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; KNL32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; KNL32-NEXT: vmovdqa %ymm0, 672(%ecx) -; KNL32-NEXT: vmovdqa 208(%eax), %xmm0 +; KNL32-NEXT: vpbroadcastd 44(%ecx), %ymm0 +; KNL32-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; KNL32-NEXT: vmovdqa %ymm0, 672(%eax) +; KNL32-NEXT: vmovdqa 208(%ecx), %xmm0 ; KNL32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero -; KNL32-NEXT: vmovdqa %ymm0, 832(%ecx) +; KNL32-NEXT: vmovdqa %ymm0, 832(%eax) ; KNL32-NEXT: retl %t64 = bitcast <2 x i32>* %src to <16 x i32>* %t87 = load <16 x i32>, <16 x i32>* %t64, align 64 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3375,11 +3375,10 @@ ; AVX2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u> -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,3] -; AVX2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[2,3] -; AVX2-NEXT: vaddps %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] +; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm1[1],xmm2[3] +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; AVX2-NEXT: vaddps %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vmovaps %xmm2, (%rax) ; AVX2-NEXT: vbroadcastss (%rax), %xmm2 ; AVX2-NEXT: vmulps %xmm1, %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc 
< %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 ; Verify that we don't emit packed vector shifts instructions if the ; condition used by the vector select is a vector of constants. @@ -589,17 +589,32 @@ ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: simplify_select: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX-NEXT: vmovd %edi, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] -; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2 -; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: simplify_select: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %edi, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] +; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; AVX1-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2 +; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: simplify_select: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovd %edi, %xmm2 +; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,0,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: retq %a = insertelement <2 x i32> , i32 %x, i32 1 %b = insertelement <2 x i32> , i32 %x, i32 0 %y = or <2 x i32> %a, %b