Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -7566,12 +7566,20 @@ return ConstantVector::get(ArrayRef(ConstantVec)); } -static bool isUseOfShuffle(SDNode *N) { +static bool isFoldableUseOfShuffle(SDNode *N) { for (auto *U : N->uses()) { - if (isTargetShuffle(U->getOpcode())) + unsigned Opc = U->getOpcode(); + // VPERMV/VPERMV3 shuffles can never fold their index operands. + if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N) + return false; + if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N) + return false; + if (isTargetShuffle(Opc)) + return true; + if (Opc == ISD::BITCAST) // Ignore bitcasts + return isFoldableUseOfShuffle(U); + if (N->hasOneUse()) return true; - if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts - return isUseOfShuffle(U); } return false; } @@ -7679,7 +7687,7 @@ SplatBitSize < VT.getSizeInBits()) { // Avoid replacing with broadcast when it's a use of a shuffle // instruction to preserve the present custom lowering of shuffles. - if (isUseOfShuffle(BVOp) || BVOp->hasOneUse()) + if (isFoldableUseOfShuffle(BVOp)) return SDValue(); // replace BUILD_VECTOR with broadcast of the repeated constants. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); Index: test/CodeGen/X86/avx512-shuffles/partial_permute.ll =================================================================== --- test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -462,9 +462,9 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) { ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -474,9 +474,9 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm4 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper @@ -490,9 +490,9 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vpermt2w %ymm0, %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper @@ -505,11 +505,11 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,21,27,10,8,19,14,5] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 -; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> @@ -521,11 +521,11 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,21,27,10,8,19,14,5] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> @@ -536,11 +536,11 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [15,13,18,16,9,11,26,8] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 -; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> @@ -552,11 +552,11 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [15,13,18,16,9,11,26,8] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> @@ -567,10 +567,10 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) { ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask3: ; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [17,0,23,10,1,8,7,30] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermt2w %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> @@ -579,11 +579,11 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [17,0,23,10,1,8,7,30] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 -; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> @@ -595,11 +595,11 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [17,0,23,10,1,8,7,30] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> @@ -760,9 +760,9 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp) { ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [16,17,5,1,14,14,13,17] +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0 +; CHECK-NEXT: vpermt2w (%rdi), %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -773,9 +773,9 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17] +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3 +; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper @@ -790,9 +790,9 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w (%rdi), %ymm1, %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [16,17,5,1,14,14,13,17] +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 +; CHECK-NEXT: vpermt2w (%rdi), %ymm1, %ymm2 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper @@ -807,9 +807,9 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1] +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3 +; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper @@ -824,9 +824,9 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w (%rdi), %ymm1, %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [7,6,4,6,12,4,27,1] +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 +; CHECK-NEXT: vpermt2w (%rdi), %ymm1, %ymm2 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper @@ -841,9 +841,9 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10] +; CHECK-NEXT: vmovdqa (%rdi), %ymm3 +; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper @@ -858,9 +858,9 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [6,18,0,4,10,25,22,10] +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 +; CHECK-NEXT: vpermt2w 32(%rdi), %ymm1, %ymm2 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper @@ -875,9 +875,9 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp) { ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9] +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NEXT: vpermt2w 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -888,9 +888,9 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9] +; CHECK-NEXT: vmovdqa (%rdi), %ymm3 +; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper @@ -905,9 +905,9 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9] +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 +; CHECK-NEXT: vpermt2w 32(%rdi), %ymm1, %ymm2 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper @@ -935,7 +935,7 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <4,0,3,2,u,u,u,u> +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,3,2] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -946,7 +946,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <4,0,3,2,u,u,u,u> +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,0,3,2] ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} @@ -961,7 +961,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <4,0,3,2,u,u,u,u> +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,3,2] ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} @@ -975,7 +975,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <3,0,7,3,u,u,u,u> +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,7,3] ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} @@ -990,7 +990,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <3,0,7,3,u,u,u,u> +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,7,3] ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} @@ -1033,7 +1033,7 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <5,3,2,5,u,u,u,u> +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [5,3,2,5] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -1044,7 +1044,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <5,3,2,5,u,u,u,u> +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,3,2,5] ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} @@ -1059,7 +1059,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <5,3,2,5,u,u,u,u> +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,3,2,5] ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} @@ -1397,8 +1397,8 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1: ; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,3,4] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <5,1,3,4,u,u,u,u> ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} @@ -1413,8 +1413,8 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1: ; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,1,3,4] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <5,1,3,4,u,u,u,u> ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} @@ -1428,11 +1428,11 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <1,1,13,0,u,u,u,u> -; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,13,0] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; CHECK-NEXT: vpermt2d %ymm4, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 -; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1444,11 +1444,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <1,1,13,0,u,u,u,u> -; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,13,0] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vpermt2d %ymm3, %ymm2, %ymm0 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1459,10 +1459,10 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [3,0,0,13] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <3,0,0,13,u,u,u,u> -; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1471,11 +1471,11 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <3,0,0,13,u,u,u,u> -; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,0,13] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; CHECK-NEXT: vpermt2d %ymm4, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 -; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1487,11 +1487,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <3,0,0,13,u,u,u,u> -; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,0,13] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vpermt2d %ymm3, %ymm2, %ymm0 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1647,9 +1647,9 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <13,0,0,6,u,u,u,u> -; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [13,0,0,6] +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NEXT: vpermt2d 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1660,9 +1660,9 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <13,0,0,6,u,u,u,u> -; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [13,0,0,6] +; CHECK-NEXT: vmovdqa (%rdi), %ymm3 +; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper @@ -1677,9 +1677,9 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <13,0,0,6,u,u,u,u> -; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [13,0,0,6] +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 +; CHECK-NEXT: vpermt2d 32(%rdi), %ymm1, %ymm2 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper @@ -1728,9 +1728,9 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <2,15,6,9,u,u,u,u> -; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [2,15,6,9] +; CHECK-NEXT: vmovdqa (%rdi), %ymm3 +; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper @@ -1745,9 +1745,9 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <2,15,6,9,u,u,u,u> -; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,15,6,9] +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 +; CHECK-NEXT: vpermt2d 32(%rdi), %ymm1, %ymm2 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper @@ -1823,9 +1823,9 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = <4,1,u,2,u,u,u,u> -; CHECK-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,1,0,2] +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; CHECK-NEXT: vpermps %ymm2, %ymm1, %ymm1 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] @@ -2035,7 +2035,8 @@ ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,2,7] +; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,7,2,7] +; CHECK-NEXT: # ymm4 = mem[0,1,0,1] ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} @@ -2050,7 +2051,8 @@ ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,7,2,7] +; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,7,2,7] +; CHECK-NEXT: # ymm2 = mem[0,1,0,1] ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 @@ -2695,7 +2697,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <1,3,5,0,u,u,u,u> +; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,0] ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 @@ -2711,7 +2713,7 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = <1,3,5,0,u,u,u,u> +; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,0] ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -2726,7 +2728,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <3,2,7,0,u,u,u,u> +; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,2,7,0] ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 @@ -2742,7 +2744,7 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = <3,2,7,0,u,u,u,u> +; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,2,7,0] ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -2757,7 +2759,7 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <3,3,5,2,u,u,u,u> +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,5,2] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -2768,7 +2770,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <3,3,5,2,u,u,u,u> +; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,3,5,2] ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 @@ -2784,7 +2786,7 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = <3,3,5,2,u,u,u,u> +; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,5,2] ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -3103,9 +3105,9 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <12,0,1,2,u,u,u,u> -; CHECK-NEXT: vpermi2ps %ymm0, %ymm2, %ymm1 +; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [12,0,1,2] +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpermt2ps %ymm0, %ymm2, %ymm1 ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -3115,9 +3117,9 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = <12,0,1,2,u,u,u,u> -; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm4 +; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [12,0,1,2] +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm4 +; CHECK-NEXT: vpermt2ps %ymm0, %ymm3, %ymm4 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcmpeqps %xmm0, %xmm2, %k1 ; CHECK-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1} @@ -3132,9 +3134,9 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <12,0,1,2,u,u,u,u> -; CHECK-NEXT: vpermi2ps %ymm0, %ymm2, %ymm3 +; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [12,0,1,2] +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vpermt2ps %ymm0, %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcmpeqps %xmm0, %xmm1, %k1 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z} @@ -3543,9 +3545,9 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm1 -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <3,3,15,9,u,u,u,u> -; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm0 +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,15,9] +; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -3556,9 +3558,9 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <3,3,15,9,u,u,u,u> -; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3 +; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9] +; CHECK-NEXT: vmovaps (%rdi), %ymm3 +; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} @@ -3574,9 +3576,9 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm1 -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = <3,3,15,9,u,u,u,u> -; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm2 +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,15,9] +; CHECK-NEXT: vmovaps (%rdi), %ymm2 +; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm1, %ymm2 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1} {z} @@ -3733,7 +3735,8 @@ ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [3,7,3,7] +; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,7,3,7] +; CHECK-NEXT: # ymm1 = mem[0,1,0,1] ; CHECK-NEXT: vpermi2pd %ymm0, %ymm2, %ymm1 ; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -3744,7 +3747,8 @@ ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [3,7,3,7] +; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,7,3,7] +; CHECK-NEXT: # ymm4 = mem[0,1,0,1] ; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm4 ; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1 @@ -3760,7 +3764,8 @@ ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [3,7,3,7] +; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,7,3,7] +; CHECK-NEXT: # ymm2 = mem[0,1,0,1] ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1 ; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm2 {%k1} {z} Index: test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll =================================================================== --- test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll +++ test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll @@ -1391,7 +1391,8 @@ ; ; ALL32-LABEL: f4xi64_i128: ; ALL32: # %bb.0: -; ALL32-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] +; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] +; ALL32-NEXT: # ymm1 = mem[0,1,0,1] ; ALL32-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: retl @@ -1431,14 +1432,15 @@ ; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f8xi64_i128: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1447,7 +1449,8 @@ ; ; AVX512-LABEL: f8xi64_i128: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retl @@ -1521,7 +1524,8 @@ ; ; AVX512-LABEL: f8xi64_i256: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,0,0,1,0,2,0,3,0] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,0,0,1,0,2,0,3,0] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retl Index: test/CodeGen/X86/horizontal-reduce-umax.ll =================================================================== --- test/CodeGen/X86/horizontal-reduce-umax.ll +++ test/CodeGen/X86/horizontal-reduce-umax.ll @@ -50,17 +50,30 @@ ; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v2i64: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X86-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2 -; X86-AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v2i64: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; X86-AVX1-NEXT: ## xmm2 = mem[0,0] +; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v2i64: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v2i64: ; X64-SSE2: ## %bb.0: @@ -524,9 +537,10 @@ ; X86-AVX1-LABEL: test_reduce_v4i64: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; X86-AVX1-NEXT: ## xmm2 = mem[0,0] +; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm3 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 @@ -544,7 +558,7 @@ ; X86-AVX2-LABEL: test_reduce_v4i64: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 ; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 ; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 @@ -1239,18 +1253,19 @@ ; X86-AVX1-LABEL: test_reduce_v8i64: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] -; X86-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] +; X86-AVX1-NEXT: ## xmm3 = mem[0,0] +; X86-AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm4, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 -; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm4 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm0, %xmm5 ; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 ; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm2 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm2 ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 @@ -1268,7 +1283,7 @@ ; ; X86-AVX2-LABEL: test_reduce_v8i64: ; X86-AVX2: ## %bb.0: -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 ; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4 ; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 Index: test/CodeGen/X86/horizontal-reduce-umin.ll =================================================================== --- test/CodeGen/X86/horizontal-reduce-umin.ll +++ test/CodeGen/X86/horizontal-reduce-umin.ll @@ -51,17 +51,30 @@ ; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v2i64: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X86-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2 -; X86-AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v2i64: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; X86-AVX1-NEXT: ## xmm2 = mem[0,0] +; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v2i64: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v2i64: ; X64-SSE2: ## %bb.0: @@ -462,9 +475,10 @@ ; X86-AVX1-LABEL: test_reduce_v4i64: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm4 +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; X86-AVX1-NEXT: ## xmm2 = mem[0,0] +; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 @@ -482,7 +496,7 @@ ; X86-AVX2-LABEL: test_reduce_v4i64: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 ; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 ; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 @@ -1141,19 +1155,20 @@ ; X86-AVX1-LABEL: test_reduce_v8i64: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] -; X86-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] +; X86-AVX1-NEXT: ## xmm3 = mem[0,0] +; X86-AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm4, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4 -; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm0, %xmm4 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm5 ; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 ; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 -; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 @@ -1170,7 +1185,7 @@ ; ; X86-AVX2-LABEL: test_reduce_v8i64: ; X86-AVX2: ## %bb.0: -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 ; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 ; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 Index: test/CodeGen/X86/i64-to-float.ll =================================================================== --- test/CodeGen/X86/i64-to-float.ll +++ test/CodeGen/X86/i64-to-float.ll @@ -179,10 +179,12 @@ ; ; X32-AVX-LABEL: clamp_sitofp_2i64_2f64: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967041,4294967295,4294967041,4294967295] +; X32-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [NaN,NaN] +; X32-AVX-NEXT: # xmm1 = mem[0,0] ; X32-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X32-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0] +; X32-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [1.2598673968951787E-321,1.2598673968951787E-321] +; X32-AVX-NEXT: # xmm1 = mem[0,0] ; X32-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X32-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X32-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] Index: test/CodeGen/X86/masked_compressstore.ll =================================================================== --- test/CodeGen/X86/masked_compressstore.ll +++ test/CodeGen/X86/masked_compressstore.ll @@ -7,7 +7,7 @@ ; HSW-LABEL: compressstore_v16f32_const: ; HSW: # %bb.0: # %cond.store ; HSW-NEXT: vmovups %ymm0, (%rdi) -; HSW-NEXT: vmovaps {{.*#+}} ymm0 = <0,1,2,4,u,u,u,u> +; HSW-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,2,4] ; HSW-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; HSW-NEXT: vmovups %xmm0, 32(%rdi) ; HSW-NEXT: vextractf128 $1, %ymm1, %xmm0 Index: test/CodeGen/X86/oddshuffles.ll =================================================================== --- test/CodeGen/X86/oddshuffles.ll +++ test/CodeGen/X86/oddshuffles.ll @@ -1392,13 +1392,14 @@ ; AVX2-SLOW-NEXT: vmovups (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups 32(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups 64(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> @@ -1421,13 +1422,14 @@ ; AVX2-FAST-NEXT: vmovups (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups 32(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups 64(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> @@ -1658,7 +1660,8 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] ; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] Index: test/CodeGen/X86/packss.ll =================================================================== --- test/CodeGen/X86/packss.ll +++ test/CodeGen/X86/packss.ll @@ -193,7 +193,8 @@ ; ; X86-AVX2-LABEL: trunc_ashr_v4i64_demandedelts: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [63,0,0,0,63,0,0,0] +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [63,0,0,0,63,0,0,0] +; X86-AVX2-NEXT: # ymm1 = mem[0,1,0,1] ; X86-AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] ; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2 Index: test/CodeGen/X86/pr30284.ll =================================================================== --- test/CodeGen/X86/pr30284.ll +++ test/CodeGen/X86/pr30284.ll @@ -21,7 +21,7 @@ ; CHECK-NEXT: vpmovd2m %zmm0, %k1 ; CHECK-NEXT: vmovapd 0, %zmm0 ; CHECK-NEXT: vmovapd 64, %zmm1 -; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [0,16,0,16,0,16,0,16,0,16,0,16,0,16,0,16] +; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm2 = [3.3951932655444357E-313,3.3951932655444357E-313,3.3951932655444357E-313,3.3951932655444357E-313,3.3951932655444357E-313,3.3951932655444357E-313,3.3951932655444357E-313,3.3951932655444357E-313] ; CHECK-NEXT: kshiftrw $8, %k1, %k2 ; CHECK-NEXT: vorpd %zmm2, %zmm1, %zmm1 {%k2} ; CHECK-NEXT: vorpd %zmm2, %zmm0, %zmm0 {%k1} Index: test/CodeGen/X86/prefer-avx256-mask-shuffle.ll =================================================================== --- test/CodeGen/X86/prefer-avx256-mask-shuffle.ll +++ test/CodeGen/X86/prefer-avx256-mask-shuffle.ll @@ -211,7 +211,8 @@ ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vptestnmb %ymm0, %ymm0, %k0 ; AVX512VLBW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512VLBW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512VLBW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512VLBW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512VLBW-NEXT: vpmovw2m %zmm0, %k0 ; AVX512VLBW-NEXT: vpmovm2b %k0, %ymm0 @@ -222,7 +223,8 @@ ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 Index: test/CodeGen/X86/shuffle-strided-with-offset-512.ll =================================================================== --- test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -278,9 +278,9 @@ ; ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_1: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [1,5,9,13,17,21,25,29] +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -345,9 +345,9 @@ ; ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_2: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,6,10,14,18,22,26,30] +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -412,9 +412,9 @@ ; ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_3: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [3,7,11,15,19,23,27,31] +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -559,9 +559,9 @@ ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_2: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [1,5,9,13,17,21,25,29] +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -706,9 +706,9 @@ ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_4: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,6,10,14,18,22,26,30] +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -853,9 +853,9 @@ ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_6: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [3,7,11,15,19,23,27,31] +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq Index: test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll =================================================================== --- test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll +++ test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll @@ -398,9 +398,9 @@ ; ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512VBMIVL-NEXT: vpermt2b 32(%rdi), %ymm0, %ymm1 ; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq @@ -479,9 +479,9 @@ ; ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28] +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -506,9 +506,9 @@ ; ; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u> -; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28] +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq @@ -626,7 +626,7 @@ ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8: ; AVX512VBMIVL: # %bb.0: ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,24,32,40,48,56,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224] ; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1 ; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi) ; AVX512VBMIVL-NEXT: vzeroupper @@ -737,10 +737,10 @@ ; ; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: ; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VBMIVL-NEXT: vpermi2b %ymm2, %ymm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0 +; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> @@ -828,10 +828,10 @@ ; ; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62] ; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VBMIVL-NEXT: vpermi2b %ymm2, %ymm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0 +; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> @@ -878,7 +878,7 @@ ; AVX512BWVL-LABEL: PR34175: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768] ; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 ; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0 @@ -899,7 +899,7 @@ ; AVX512VBMIVL-LABEL: PR34175: ; AVX512VBMIVL: # %bb.0: ; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768] ; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 ; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0 Index: test/CodeGen/X86/shuffle-vs-trunc-512.ll =================================================================== --- test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -398,9 +398,9 @@ ; ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512VBMIVL-NEXT: vpermt2b 32(%rdi), %ymm0, %ymm1 ; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq @@ -479,9 +479,9 @@ ; ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28] +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -506,9 +506,9 @@ ; ; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u> -; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28] +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq @@ -589,9 +589,9 @@ ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28] +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -616,9 +616,9 @@ ; ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u> -; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28] +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512VBMIVL-NEXT: vpmovwb %xmm1, (%rsi) ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq @@ -728,10 +728,10 @@ ; ; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: ; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VBMIVL-NEXT: vpermi2b %ymm2, %ymm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0 +; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> @@ -819,10 +819,10 @@ ; ; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62] ; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VBMIVL-NEXT: vpermi2b %ymm2, %ymm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0 +; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> @@ -869,7 +869,7 @@ ; AVX512BWVL-LABEL: PR34175: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768] ; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 ; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0 @@ -890,7 +890,7 @@ ; AVX512VBMIVL-LABEL: PR34175: ; AVX512VBMIVL: # %bb.0: ; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768] ; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 ; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0 Index: test/CodeGen/X86/subvector-broadcast.ll =================================================================== --- test/CodeGen/X86/subvector-broadcast.ll +++ test/CodeGen/X86/subvector-broadcast.ll @@ -874,8 +874,9 @@ ; ; X32-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64: ; X32-AVX512: # %bb.0: # %entry -; X32-AVX512-NEXT: vpaddq {{\.LCPI.*}}, %ymm0, %ymm0 -; X32-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0] +; X32-AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,2,0,3,0,4,0] +; X32-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; X32-AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 ; X32-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1 ; X32-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; X32-AVX512-NEXT: vmovdqu %ymm0, ga4 Index: test/CodeGen/X86/vector-fshl-256.ll =================================================================== --- test/CodeGen/X86/vector-fshl-256.ll +++ test/CodeGen/X86/vector-fshl-256.ll @@ -2080,7 +2080,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpsllw $2, %ymm2, %ymm4 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 @@ -2106,7 +2107,8 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm4 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 @@ -2132,7 +2134,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm4 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 Index: test/CodeGen/X86/vector-fshl-512.ll =================================================================== --- test/CodeGen/X86/vector-fshl-512.ll +++ test/CodeGen/X86/vector-fshl-512.ll @@ -1141,7 +1141,8 @@ ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm4 ; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm7 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] @@ -1190,7 +1191,8 @@ ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm4 ; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm7 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] @@ -1239,7 +1241,8 @@ ; ; AVX512BW-LABEL: constant_funnnel_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 @@ -1270,7 +1273,8 @@ ; ; AVX512VBMI2-LABEL: constant_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VBMI2-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1 ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 @@ -1301,7 +1305,8 @@ ; ; AVX512VLBW-LABEL: constant_funnnel_v64i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VLBW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 @@ -1332,7 +1337,8 @@ ; ; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VLVBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VLVBMI2-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1 ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 Index: test/CodeGen/X86/vector-fshl-rot-256.ll =================================================================== --- test/CodeGen/X86/vector-fshl-rot-256.ll +++ test/CodeGen/X86/vector-fshl-rot-256.ll @@ -1184,7 +1184,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsllw $2, %ymm1, %ymm3 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 @@ -1208,7 +1209,8 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm3 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 @@ -1232,7 +1234,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm3 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 Index: test/CodeGen/X86/vector-fshl-rot-512.ll =================================================================== --- test/CodeGen/X86/vector-fshl-rot-512.ll +++ test/CodeGen/X86/vector-fshl-rot-512.ll @@ -561,7 +561,8 @@ ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] @@ -607,7 +608,8 @@ ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] @@ -653,7 +655,8 @@ ; ; AVX512BW-LABEL: constant_funnnel_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 @@ -680,7 +683,8 @@ ; ; AVX512VLBW-LABEL: constant_funnnel_v64i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VLBW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 Index: test/CodeGen/X86/vector-fshr-256.ll =================================================================== --- test/CodeGen/X86/vector-fshr-256.ll +++ test/CodeGen/X86/vector-fshr-256.ll @@ -2095,7 +2095,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 @@ -2121,7 +2122,8 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 @@ -2147,7 +2149,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536] +; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 Index: test/CodeGen/X86/vector-fshr-512.ll =================================================================== --- test/CodeGen/X86/vector-fshr-512.ll +++ test/CodeGen/X86/vector-fshr-512.ll @@ -1137,7 +1137,8 @@ ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536] +; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] @@ -1186,7 +1187,8 @@ ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536] +; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] @@ -1235,7 +1237,8 @@ ; ; AVX512BW-LABEL: constant_funnnel_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 @@ -1265,7 +1268,8 @@ ; ; AVX512VBMI2-LABEL: constant_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536] +; AVX512VBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536] +; AVX512VBMI2-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1 ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 @@ -1295,7 +1299,8 @@ ; ; AVX512VLBW-LABEL: constant_funnnel_v64i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536] +; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536] +; AVX512VLBW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 @@ -1325,7 +1330,8 @@ ; ; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536] +; AVX512VLVBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536] +; AVX512VLVBMI2-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1 ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 Index: test/CodeGen/X86/vector-fshr-rot-256.ll =================================================================== --- test/CodeGen/X86/vector-fshr-rot-256.ll +++ test/CodeGen/X86/vector-fshr-rot-256.ll @@ -1261,7 +1261,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsllw $2, %ymm1, %ymm3 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 @@ -1285,7 +1286,8 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm3 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 @@ -1309,7 +1311,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] +; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm3 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 Index: test/CodeGen/X86/vector-fshr-rot-512.ll =================================================================== --- test/CodeGen/X86/vector-fshr-rot-512.ll +++ test/CodeGen/X86/vector-fshr-rot-512.ll @@ -581,7 +581,8 @@ ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] @@ -627,7 +628,8 @@ ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] +; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] @@ -673,7 +675,8 @@ ; ; AVX512BW-LABEL: constant_funnnel_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 @@ -700,7 +703,8 @@ ; ; AVX512VLBW-LABEL: constant_funnnel_v64i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] +; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] +; AVX512VLBW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 Index: test/CodeGen/X86/vector-rotate-256.ll =================================================================== --- test/CodeGen/X86/vector-rotate-256.ll +++ test/CodeGen/X86/vector-rotate-256.ll @@ -1177,7 +1177,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsllw $2, %ymm1, %ymm3 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 @@ -1201,7 +1202,8 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm3 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 @@ -1225,7 +1227,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm3 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 Index: test/CodeGen/X86/vector-rotate-512.ll =================================================================== --- test/CodeGen/X86/vector-rotate-512.ll +++ test/CodeGen/X86/vector-rotate-512.ll @@ -542,7 +542,8 @@ ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] @@ -588,7 +589,8 @@ ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] @@ -634,7 +636,8 @@ ; ; AVX512BW-LABEL: constant_rotate_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 @@ -661,7 +664,8 @@ ; ; AVX512VLBW-LABEL: constant_rotate_v64i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512VLBW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 Index: test/CodeGen/X86/vector-shift-ashr-256.ll =================================================================== --- test/CodeGen/X86/vector-shift-ashr-256.ll +++ test/CodeGen/X86/vector-shift-ashr-256.ll @@ -89,7 +89,8 @@ ; X32-AVX1-LABEL: var_shift_v4i64: ; X32-AVX1: # %bb.0: ; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X32-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] +; X32-AVX1-NEXT: # xmm3 = mem[0,0] ; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 ; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] ; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6 Index: test/CodeGen/X86/vector-shift-shl-256.ll =================================================================== --- test/CodeGen/X86/vector-shift-shl-256.ll +++ test/CodeGen/X86/vector-shift-shl-256.ll @@ -958,7 +958,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsllw $2, %ymm0, %ymm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 @@ -991,7 +992,8 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 @@ -1013,7 +1015,8 @@ ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; AVX512DQVL-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpsllw $2, %ymm0, %ymm1 ; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 @@ -1058,7 +1061,8 @@ ; X32-AVX2: # %bb.0: ; X32-AVX2-NEXT: vpsllw $4, %ymm0, %ymm1 ; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1 -; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; X32-AVX2-NEXT: vpsllw $2, %ymm0, %ymm1 ; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1 Index: test/CodeGen/X86/vector-shift-shl-512.ll =================================================================== --- test/CodeGen/X86/vector-shift-shl-512.ll +++ test/CodeGen/X86/vector-shift-shl-512.ll @@ -208,7 +208,8 @@ ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] @@ -230,7 +231,8 @@ ; ; AVX512BW-LABEL: constant_shift_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 Index: test/CodeGen/X86/vector-shuffle-256-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v16.ll +++ test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -726,7 +726,8 @@ ; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,0,0,0,1] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,1,0,0,0,1] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; @@ -739,7 +740,8 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0] +; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0] +; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -764,7 +766,8 @@ ; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; @@ -777,7 +780,8 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0] +; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0] +; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -802,7 +806,8 @@ ; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; @@ -815,7 +820,8 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0] +; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0] +; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -843,7 +849,8 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0,0,0,0,4,0,0,0,0] +; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0,0,0,0,4,0,0,0,0] +; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -871,7 +878,8 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0,0,0,5,0,0,0,0,0] +; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0,0,0,5,0,0,0,0,0] +; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -899,7 +907,8 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0,0,6,0,0,0,0,0,0] +; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0,0,6,0,0,0,0,0,0] +; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -927,7 +936,8 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -3798,7 +3808,8 @@ ; ; AVX2-FAST-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,4,4,5,6,4] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,5,6,4,4,5,6,4] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15,16,17,18,19,20,21,18,19,24,25,26,27,30,31,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] Index: test/CodeGen/X86/vector-shuffle-256-v32.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v32.ll +++ test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -1336,7 +1336,8 @@ ; ; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VLVBMI-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VLVBMI-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> @@ -1370,7 +1371,8 @@ ; ; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VLVBMI-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VLVBMI-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> @@ -1404,7 +1406,8 @@ ; ; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> @@ -1438,7 +1441,8 @@ ; ; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> @@ -1472,7 +1476,8 @@ ; ; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> @@ -1514,7 +1519,8 @@ ; ; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> Index: test/CodeGen/X86/vector-shuffle-256-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v8.ll +++ test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -750,7 +750,8 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8f32_c348cda0: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u> +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] @@ -761,7 +762,8 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,3,4,7,4,7,2,0] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u> +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] ; AVX2-FAST-NEXT: retq @@ -829,7 +831,8 @@ ; ; AVX2-FAST-LABEL: shuffle_v8f32_32103210: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; @@ -841,8 +844,9 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_32103210: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -863,7 +867,8 @@ ; ; AVX2-FAST-LABEL: shuffle_v8f32_76547654: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; @@ -875,8 +880,9 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_76547654: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -2080,7 +2086,8 @@ ; ; AVX2-FAST-LABEL: shuffle_v8i32_32103210: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; @@ -2092,8 +2099,9 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_32103210: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2114,7 +2122,8 @@ ; ; AVX2-FAST-LABEL: shuffle_v8i32_76547654: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; @@ -2126,8 +2135,9 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_76547654: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -3000,13 +3010,15 @@ ; AVX2-LABEL: lowhalf_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,6,3,6,u,u,u,u> +; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,6,2,6,3,6] +; AVX2-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: lowhalf_v8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <2,14,3,14,u,u,u,u> +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,14,3,14,2,14,3,14] +; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512VL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %r = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> @@ -3026,13 +3038,15 @@ ; AVX2-LABEL: lowhalf_v8f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,6,3,6,u,u,u,u> +; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,6,2,6,3,6] +; AVX2-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: lowhalf_v8f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = <2,14,3,14,u,u,u,u> +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,14,3,14,2,14,3,14] +; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %r = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> Index: test/CodeGen/X86/vector-shuffle-512-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v16.ll +++ test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -325,7 +325,7 @@ ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 ; ALL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vmovaps {{.*#+}} ymm2 = +; ALL-NEXT: vbroadcastsd {{.*#+}} ymm2 = [17179869184,17179869184,17179869184,17179869184] ; ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; ALL-NEXT: vzeroupper Index: test/CodeGen/X86/vector-shuffle-512-v32.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v32.ll +++ test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -338,10 +338,10 @@ ; ; SKX-LABEL: pr32967: ; SKX: ## %bb.0: +; SKX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29] ; SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u> -; SKX-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 -; SKX-NEXT: vmovdqa %xmm1, %xmm0 +; SKX-NEXT: vpermt2w %ymm2, %ymm1, %ymm0 +; SKX-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %shuffle = shufflevector <32 x i16> %v, <32 x i16> undef, <8 x i32> Index: test/CodeGen/X86/vector-shuffle-512-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v8.ll +++ test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -212,13 +212,15 @@ ; ; AVX512F-LABEL: shuffle_v8f64_08080808: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_08080808: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,8,0,0,0,8,0,0,0,8,0,0,0,8,0] +; AVX512F-32-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,0,8,0,0,0,8,0,0,0,8,0,0,0,8,0] +; AVX512F-32-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -1084,13 +1086,15 @@ ; ; AVX512F-LABEL: shuffle_v8i64_08080808: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_08080808: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,8,0,0,0,8,0,0,0,8,0,0,0,8,0] +; AVX512F-32-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,0,8,0,0,0,8,0,0,0,8,0,0,0,8,0] +; AVX512F-32-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> Index: test/CodeGen/X86/vector-shuffle-combining-avx.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -373,7 +373,7 @@ ; X86-AVX512: # %bb.0: # %entry ; X86-AVX512-NEXT: vmovups 0, %zmm0 ; X86-AVX512-NEXT: vmovups 64, %ymm1 -; X86-AVX512-NEXT: vmovaps {{.*#+}} zmm2 = <2,5,8,11,14,17,20,23,u,u,u,u,u,u,u,u> +; X86-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23] ; X86-AVX512-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 ; X86-AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X86-AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm1 @@ -416,7 +416,7 @@ ; X64-AVX512: # %bb.0: # %entry ; X64-AVX512-NEXT: vmovups 0, %zmm0 ; X64-AVX512-NEXT: vmovups 64, %ymm1 -; X64-AVX512-NEXT: vmovaps {{.*#+}} zmm2 = <2,5,8,11,14,17,20,23,u,u,u,u,u,u,u,u> +; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23] ; X64-AVX512-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 ; X64-AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm1 Index: test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll +++ test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll @@ -61,7 +61,8 @@ define <32 x i8> @combine_vpermi2var_32i8_as_vpermb(<32 x i8> %x0, <32 x i8> %x1) { ; CHECK-LABEL: combine_vpermi2var_32i8_as_vpermb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19] +; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19] +; CHECK-NEXT: # ymm1 = mem[0,1,0,1] ; CHECK-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = shufflevector <32 x i8> %x0, <32 x i8> %x1, <32 x i32> @@ -71,7 +72,8 @@ define <64 x i8> @combine_vpermi2var_64i8_as_vpermb(<64 x i8> %x0, <64 x i8> %x1) { ; CHECK-LABEL: combine_vpermi2var_64i8_as_vpermb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19] +; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19] +; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; CHECK-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = shufflevector <64 x i8> %x0, <64 x i8> %x1, <64 x i32> @@ -94,7 +96,8 @@ define <32 x i8> @combine_vpermi2var_32i8_as_vperm2(<32 x i8> %x0, <32 x i8> %x1) { ; CHECK-LABEL: combine_vpermi2var_32i8_as_vperm2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19] +; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19] +; CHECK-NEXT: # ymm2 = mem[0,1,0,1] ; CHECK-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = shufflevector <32 x i8> %x0, <32 x i8> %x1, <32 x i32> Index: test/CodeGen/X86/vector-shuffle-combining.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining.ll +++ test/CodeGen/X86/vector-shuffle-combining.ll @@ -2465,7 +2465,8 @@ ; AVX2-FAST-LABEL: combine_unneeded_subvector1: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq %b = add <8 x i32> %a, Index: test/CodeGen/X86/vector-shuffle-v1.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-v1.ll +++ test/CodeGen/X86/vector-shuffle-v1.ll @@ -254,7 +254,8 @@ ; VL_BW_DQ-NEXT: vpsllw $7, %ymm0, %ymm0 ; VL_BW_DQ-NEXT: vpmovb2m %ymm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0 -; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; VL_BW_DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %ymm0 @@ -308,7 +309,8 @@ ; VL_BW_DQ: # %bb.0: ; VL_BW_DQ-NEXT: vptestnmw %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0 -; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; VL_BW_DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm3, %zmm0 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1 ; VL_BW_DQ-NEXT: vpblendmw %zmm1, %zmm2, %zmm0 {%k1} @@ -364,7 +366,8 @@ ; VL_BW_DQ: # %bb.0: ; VL_BW_DQ-NEXT: vptestnmb %ymm0, %ymm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0 -; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; VL_BW_DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm3, %zmm0 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1 ; VL_BW_DQ-NEXT: vpblendmb %ymm1, %ymm2, %ymm0 {%k1} @@ -412,7 +415,8 @@ ; VL_BW_DQ-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; VL_BW_DQ-NEXT: kunpckwd %k0, %k1, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0 -; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; VL_BW_DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1 ; VL_BW_DQ-NEXT: vpblendmw %zmm2, %zmm3, %zmm0 {%k1} @@ -462,7 +466,8 @@ ; VL_BW_DQ-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; VL_BW_DQ-NEXT: kunpckwd %k0, %k1, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0 -; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; VL_BW_DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1 ; VL_BW_DQ-NEXT: vpblendmb %ymm2, %ymm3, %ymm0 {%k1}