diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22613,6 +22613,47 @@
     }
   }
 
+  // If we're not performing a select/blend shuffle, see if we can convert the
+  // shuffle into an AND node if all the out-of-lane elements are known zero.
+  {
+    bool IsInLaneMask = true;
+    ArrayRef<int> Mask = SVN->getMask();
+    SmallVector<int> ClearMask(NumElts, -1);
+    APInt DemandedLHS = APInt::getNullValue(NumElts);
+    APInt DemandedRHS = APInt::getNullValue(NumElts);
+    for (int I = 0; I != (int)NumElts; ++I) {
+      int M = Mask[I];
+      if (M < 0)
+        continue;
+      IsInLaneMask &= (M == I) || (M == (I + NumElts));
+      if (M != I) {
+        APInt &Demanded = M < (int)NumElts ? DemandedLHS : DemandedRHS;
+        Demanded.setBit(M % NumElts);
+      }
+    }
+    if (!IsInLaneMask &&
+        (!DemandedLHS.isNullValue() || !DemandedRHS.isNullValue()) &&
+        (DemandedLHS.isNullValue() ||
+         DAG.MaskedVectorIsZero(N0, DemandedLHS)) &&
+        (DemandedRHS.isNullValue() ||
+         DAG.MaskedVectorIsZero(N1, DemandedRHS))) {
+      SDLoc DL(N);
+      EVT IntVT = VT.changeVectorElementTypeToInteger();
+      EVT IntSVT = VT.getVectorElementType().changeTypeToInteger();
+      SDValue ZeroElt = DAG.getConstant(0, DL, IntSVT);
+      SDValue AllOnesElt = DAG.getAllOnesConstant(DL, IntSVT);
+      SmallVector<SDValue> AndMask(NumElts, ZeroElt);
+      for (int I = 0; I != (int)NumElts; ++I)
+        if (Mask[I] == I)
+          AndMask[I] = AllOnesElt;
+
+      // TODO: Should we try to mask with N1 as well?
+      return DAG.getBitcast(
+          VT, DAG.getNode(ISD::AND, DL, IntVT, DAG.getBitcast(IntVT, N0),
+                          DAG.getBuildVector(IntVT, DL, AndMask)));
+    }
+  }
+
   // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
   // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
   if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT))
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -549,6 +549,10 @@
   /// should be stack expanded.
   bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
 
+  /// Similar to isShuffleMaskLegal. Return true if the given 'select with
+  /// zero' shuffle mask can be codegen'd directly.
+  bool isVectorClearMaskLegal(ArrayRef<int> M, EVT VT) const override;
+
   /// Return the ISD::SETCC ValueType.
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                          EVT VT) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11799,6 +11799,12 @@
           isConcatMask(M, VT, VT.getSizeInBits() == 128));
 }
 
+bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
+                                                   EVT VT) const {
+  // Just delegate to the generic legality; clear masks aren't special.
+  return isShuffleMaskLegal(M, VT);
+}
+
 /// getVShiftImm - Check if this is a valid build_vector for the immediate
 /// operand of a vector shift operation, where all the elements of the
 /// build_vector must have the same constant integer value.
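For orientation (not part of the patch): a hand-written LLVM IR sketch of the kind of 'select with zero' shuffle this combine targets. The function name and mask below are illustrative, chosen to mirror the updated AArch64/X86 tests that follow; lanes taken from the zero operand are simply cleared, so the shuffle is equivalent to an AND with an all-ones/zero constant mask.

define <4 x i32> @shuffle_with_zero(<4 x i32> %x) {
  ; keep lanes 0 and 2 of %x, take lanes 1 and 3 from the zero vector
  %r = shufflevector <4 x i32> %x, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x i32> %r
}
; ...which is equivalent to, and can now be selected through, a plain vector AND:
;   %r = and <4 x i32> %x, <i32 -1, i32 0, i32 -1, i32 0>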
diff --git a/llvm/test/CodeGen/AArch64/build-vector-extract.ll b/llvm/test/CodeGen/AArch64/build-vector-extract.ll --- a/llvm/test/CodeGen/AArch64/build-vector-extract.ll +++ b/llvm/test/CodeGen/AArch64/build-vector-extract.ll @@ -5,7 +5,8 @@ ; CHECK-LABEL: extract0_i32_zext_insert0_i64_undef: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: zip1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: mov v1.s[0], v0.s[0] +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 0 %z = zext i32 %e to i64 @@ -160,7 +161,9 @@ define <2 x i64> @extract2_i32_zext_insert1_i64_undef(<4 x i32> %x) { ; CHECK-LABEL: extract2_i32_zext_insert1_i64_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.s[3], wzr +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: mov v1.s[2], v0.s[2] +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 2 %z = zext i32 %e to i64 diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll --- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll @@ -907,23 +907,11 @@ ret <8 x i8> %c } -; CHECK-LABEL: .LCPI90_0: -; CHECK-NEXT: .byte 0 -; CHECK-NEXT: .byte 255 -; CHECK-NEXT: .byte 2 -; CHECK-NEXT: .byte 255 -; CHECK-NEXT: .byte 4 -; CHECK-NEXT: .byte 5 -; CHECK-NEXT: .byte 6 -; CHECK-NEXT: .byte 7 define <8 x i8> @vselect_equivalent_shuffle_v8i8_zero(<8 x i8> %a) { ; CHECK-LABEL: vselect_equivalent_shuffle_v8i8_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI90_0 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v0.d[1], v0.d[0] -; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI90_0] -; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b +; CHECK-NEXT: movi d1, #0xffffffff00ff00ff +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %c = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <8 x i32> ret <8 x i8> %c @@ -982,28 +970,20 @@ } ; CHECK-LABEL: .LCPI93_0: -; CHECK-NEXT: .byte 0 -; CHECK-NEXT: .byte 1 -; CHECK-NEXT: .byte 255 -; CHECK-NEXT: .byte 255 -; CHECK-NEXT: .byte 4 -; CHECK-NEXT: .byte 5 -; CHECK-NEXT: .byte 255 -; CHECK-NEXT: .byte 255 -; CHECK-NEXT: .byte 8 -; CHECK-NEXT: .byte 9 -; CHECK-NEXT: .byte 10 -; CHECK-NEXT: .byte 11 -; CHECK-NEXT: .byte 12 -; CHECK-NEXT: .byte 13 -; CHECK-NEXT: .byte 14 -; CHECK-NEXT: .byte 15 +; CHECK-NEXT: .hword 65535 // 0xffff +; CHECK-NEXT: .hword 0 // 0x0 +; CHECK-NEXT: .hword 65535 // 0xffff +; CHECK-NEXT: .hword 0 // 0x0 +; CHECK-NEXT: .hword 65535 // 0xffff +; CHECK-NEXT: .hword 65535 // 0xffff +; CHECK-NEXT: .hword 65535 // 0xffff +; CHECK-NEXT: .hword 65535 // 0xffff define <8 x i16> @vselect_equivalent_shuffle_v8i16_zero(<8 x i16> %a) { ; CHECK-LABEL: vselect_equivalent_shuffle_v8i16_zero: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI93_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI93_0] -; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %c = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> ret <8 x i16> %c diff --git a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll --- a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll +++ b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll @@ -56,7 +56,6 @@ ; CHECK-NEXT: bne .LBB3_2 ; CHECK-NEXT: @ %bb.1: @ %bb1.preheader ; CHECK-NEXT: vmov.i32 q8, #0x0 -; CHECK-NEXT: vext.8 q8, q8, q8, #4 ; CHECK-NEXT: .LBB3_2: @ %bb2 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, 
d17 diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -4182,13 +4182,19 @@ ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; KNL-NEXT: kshiftlw $12, %k0, %k0 -; KNL-NEXT: kshiftrw $12, %k0, %k1 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: movw $255, %ax +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kandw %k1, %k0, %k1 ; KNL-NEXT: vpblendmd %zmm5, %zmm4, %zmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: mask_widening: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; SKX-NEXT: movw $255, %ax +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: kandw %k1, %k0, %k1 ; SKX-NEXT: vpblendmd %zmm5, %zmm4, %zmm0 {%k1} ; SKX-NEXT: retq ; @@ -4198,7 +4204,10 @@ ; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kshiftlw $12, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 +; AVX512BW-NEXT: movw $255, %ax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: vpblendmd %zmm5, %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: retq ; @@ -4208,7 +4217,10 @@ ; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512DQ-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; AVX512DQ-NEXT: kshiftlw $12, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k0 +; AVX512DQ-NEXT: movw $255, %ax +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kandw %k1, %k0, %k1 ; AVX512DQ-NEXT: vpblendmd %zmm5, %zmm4, %zmm0 {%k1} ; AVX512DQ-NEXT: retq ; @@ -4221,8 +4233,11 @@ ; X86-NEXT: .cfi_def_cfa_register %ebp ; X86-NEXT: andl $-64, %esp ; X86-NEXT: subl $64, %esp -; X86-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; X86-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 ; X86-NEXT: vmovdqa64 8(%ebp), %zmm0 +; X86-NEXT: movw $255, %ax +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kandw %k1, %k0, %k1 ; X86-NEXT: vmovdqa32 72(%ebp), %zmm0 {%k1} ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll --- a/llvm/test/CodeGen/X86/combine-sra.ll +++ b/llvm/test/CodeGen/X86/combine-sra.ll @@ -150,21 +150,23 @@ ; SSE: # %bb.0: ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; SSE-NEXT: xorps %xmm2, %xmm2 +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrad %xmm3, %xmm4 +; SSE-NEXT: blendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: psrad %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: psrad %xmm4, %xmm5 -; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrad %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrad %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm1[2,3,3,3,4,5,6,7] ; SSE-NEXT: psrad %xmm1, %xmm0 -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3],xmm3[4,5],xmm0[6,7] ; SSE-NEXT: retq ; ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_and: diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll --- a/llvm/test/CodeGen/X86/combine-srl.ll +++ b/llvm/test/CodeGen/X86/combine-srl.ll @@ -406,21 +406,23 @@ ; SSE: # %bb.0: ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; SSE-NEXT: xorps %xmm2, %xmm2 +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrld %xmm3, %xmm4 +; SSE-NEXT: blendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: psrld %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: psrld %xmm4, %xmm5 -; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrld %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrld %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,3,3,3,4,5,6,7] ; SSE-NEXT: psrld %xmm1, %xmm0 -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3],xmm3[4,5],xmm0[6,7] ; SSE-NEXT: retq ; ; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_and: diff --git a/llvm/test/CodeGen/X86/pr45563-2.ll b/llvm/test/CodeGen/X86/pr45563-2.ll --- a/llvm/test/CodeGen/X86/pr45563-2.ll +++ b/llvm/test/CodeGen/X86/pr45563-2.ll @@ -40,7 +40,7 @@ ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; CHECK-NEXT: vmaskmovps (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0 -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: vmovd %ecx, %xmm2 ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm2, %ymm3 diff --git a/llvm/test/CodeGen/X86/pr45833.ll b/llvm/test/CodeGen/X86/pr45833.ll --- a/llvm/test/CodeGen/X86/pr45833.ll +++ b/llvm/test/CodeGen/X86/pr45833.ll @@ -20,7 +20,7 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: vmovd %eax, %xmm2 ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vmaskmovps %ymm1, %ymm2, 32(%rdi) diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -1058,47 +1058,19 @@ ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; -; AVX1-LABEL: sad_double_reduction: 
-; AVX1: # %bb.0: # %bb -; AVX1-NEXT: vmovdqu (%rdi), %xmm0 -; AVX1-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqu (%rdx), %xmm1 -; AVX1-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: sad_double_reduction: -; AVX2: # %bb.0: # %bb -; AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu (%rdx), %xmm1 -; AVX2-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 -; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq -; -; AVX512-LABEL: sad_double_reduction: -; AVX512: # %bb.0: # %bb -; AVX512-NEXT: vmovdqu (%rdi), %xmm0 -; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu (%rdx), %xmm1 -; AVX512-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 -; AVX512-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: retq +; AVX-LABEL: sad_double_reduction: +; AVX: # %bb.0: # %bb +; AVX-NEXT: vmovdqu (%rdi), %xmm0 +; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqu (%rdx), %xmm1 +; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq bb: %tmp = load <16 x i8>, <16 x i8>* %arg, align 1 %tmp4 = load <16 x i8>, <16 x i8>* %arg1, align 1 @@ -1148,47 +1120,19 @@ ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; -; AVX1-LABEL: sad_double_reduction_abs: -; AVX1: # %bb.0: # %bb -; AVX1-NEXT: vmovdqu (%rdi), %xmm0 -; AVX1-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqu (%rdx), %xmm1 -; AVX1-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: sad_double_reduction_abs: -; AVX2: # %bb.0: # %bb -; AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu (%rdx), %xmm1 -; AVX2-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 -; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq -; -; AVX512-LABEL: sad_double_reduction_abs: -; AVX512: # %bb.0: # %bb -; AVX512-NEXT: vmovdqu (%rdi), %xmm0 -; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu (%rdx), %xmm1 -; AVX512-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 -; AVX512-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: retq +; AVX-LABEL: sad_double_reduction_abs: +; AVX: # %bb.0: # %bb +; AVX-NEXT: vmovdqu (%rdi), %xmm0 +; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqu (%rdx), %xmm1 +; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq bb: %tmp = load <16 x i8>, <16 x i8>* %arg, align 1 %tmp4 = load <16 x i8>, <16 x i8>* %arg1, align 1 diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -1749,7 +1749,11 @@ ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; @@ -1772,7 +1776,11 @@ ; X64-SSE-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll --- a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll +++ b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll @@ -167,7 +167,7 @@ ; CHECK-LABEL: f19: ; CHECK: # %bb.0: ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; CHECK-NEXT: vcvtph2psx %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %ret = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16( diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -202,26 +202,28 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind { ; SSE2-LABEL: var_funnnel_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pandn %xmm4, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [31,31,31,31] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] ; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: psrld %xmm3, %xmm6 -; SSE2-NEXT: pshuflw 
{{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrld %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrld %xmm7, %xmm3 +; SSE2-NEXT: psrld %xmm5, %xmm3 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: psrld %xmm6, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm5, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] -; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: psrld %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,3,3,3,4,5,6,7] +; SSE2-NEXT: psrld %xmm4, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[0,3] +; SSE2-NEXT: pand %xmm8, %xmm2 ; SSE2-NEXT: pslld $23, %xmm2 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1 @@ -237,31 +239,33 @@ ; ; SSE41-LABEL: var_funnnel_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] ; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pandn %xmm8, %xmm4 -; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] +; SSE41-NEXT: pandn %xmm3, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; SSE41-NEXT: psrld $1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm7 +; SSE41-NEXT: psrld %xmm6, %xmm7 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1],xmm5[2,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm6 ; SSE41-NEXT: psrld %xmm5, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psrld %xmm7, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm6 -; SSE41-NEXT: psrld %xmm4, %xmm6 -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4,5,6,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm7 +; SSE41-NEXT: psrld %xmm5, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,3,3,3,4,5,6,7] ; SSE41-NEXT: psrld %xmm4, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7] -; SSE41-NEXT: pand %xmm8, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7] +; SSE41-NEXT: pand %xmm3, %xmm2 ; SSE41-NEXT: pslld $23, %xmm2 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 ; SSE41-NEXT: pmulld %xmm1, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_funnnel_v4i32: @@ -385,35 +389,37 @@ ; ; X86-SSE2-LABEL: var_funnnel_v4i32: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: 
movdqa {{.*#+}} xmm4 = [31,31,31,31] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: pandn %xmm4, %xmm5 +; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] ; X86-SSE2-NEXT: psrld $1, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 ; X86-SSE2-NEXT: psrld %xmm3, %xmm6 -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] +; X86-SSE2-NEXT: pxor %xmm7, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; X86-SSE2-NEXT: movss {{.*#+}} xmm7 = xmm5[0],xmm7[1,2,3] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 ; X86-SSE2-NEXT: psrld %xmm7, %xmm3 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 -; X86-SSE2-NEXT: psrld %xmm6, %xmm7 -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; X86-SSE2-NEXT: psrld %xmm5, %xmm1 -; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: psrld %xmm0, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,3,3,3,4,5,6,7] +; X86-SSE2-NEXT: psrld %xmm0, %xmm1 +; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm6[0,3] +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm2 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1 +; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm0, %xmm4 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE2-NEXT: por %xmm3, %xmm0 ; X86-SSE2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -203,25 +203,27 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind { ; SSE2-LABEL: var_funnnel_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31] +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [31,31,31,31] ; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: psrld %xmm3, %xmm6 -; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE2-NEXT: movss {{.*#+}} xmm7 = xmm5[0],xmm7[1,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: psrld %xmm7, %xmm3 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; SSE2-NEXT: 
pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: psrld %xmm6, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm5, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] -; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: psrld %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,3,3,3,4,5,6,7] +; SSE2-NEXT: psrld %xmm4, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm6[0,3] +; SSE2-NEXT: pandn %xmm8, %xmm2 ; SSE2-NEXT: pslld $23, %xmm2 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1 @@ -238,31 +240,33 @@ ; ; SSE41-LABEL: var_funnnel_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] ; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pand %xmm8, %xmm4 -; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] +; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE41-NEXT: movdqa %xmm1, %xmm7 +; SSE41-NEXT: psrld %xmm6, %xmm7 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1],xmm5[2,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm6 ; SSE41-NEXT: psrld %xmm5, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psrld %xmm7, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm6 -; SSE41-NEXT: psrld %xmm4, %xmm6 -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4,5,6,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm7 +; SSE41-NEXT: psrld %xmm5, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,3,3,3,4,5,6,7] ; SSE41-NEXT: psrld %xmm4, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7] -; SSE41-NEXT: pandn %xmm8, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7] +; SSE41-NEXT: pandn %xmm3, %xmm2 ; SSE41-NEXT: pslld $23, %xmm2 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 ; SSE41-NEXT: pslld $1, %xmm0 ; SSE41-NEXT: pmulld %xmm1, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_funnnel_v4i32: @@ -387,35 +391,37 @@ ; ; X86-SSE2-LABEL: var_funnnel_v4i32: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: pand %xmm4, %xmm5 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 ; X86-SSE2-NEXT: psrld %xmm3, %xmm6 -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] +; X86-SSE2-NEXT: pxor %xmm7, 
%xmm7 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; X86-SSE2-NEXT: movss {{.*#+}} xmm7 = xmm5[0],xmm7[1,2,3] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 ; X86-SSE2-NEXT: psrld %xmm7, %xmm3 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 -; X86-SSE2-NEXT: psrld %xmm6, %xmm7 -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; X86-SSE2-NEXT: psrld %xmm5, %xmm1 -; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] -; X86-SSE2-NEXT: pandn %xmm4, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: psrld %xmm0, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,3,3,3,4,5,6,7] +; X86-SSE2-NEXT: psrld %xmm0, %xmm1 +; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm6[0,3] +; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm2 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1 -; X86-SSE2-NEXT: pslld $1, %xmm0 +; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm0 +; X86-SSE2-NEXT: pslld $1, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm0, %xmm4 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE2-NEXT: por %xmm3, %xmm0 ; X86-SSE2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -1013,21 +1013,21 @@ ; SSE2-LABEL: shuffle_v4f32_0z2z: ; SSE2: # %bb.0: ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4f32_0z2z: ; SSE3: # %bb.0: ; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4f32_0z2z: ; SSSE3: # %bb.0: ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSSE3-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -2249,9 +2249,7 @@ define <8 x i64> @test_v8i64_insert_zero_128(<8 x i64> %a) { ; ALL-LABEL: test_v8i64_insert_zero_128: ; ALL: # %bb.0: -; ALL-NEXT: movb $3, %al -; ALL-NEXT: kmovw %eax, %k1 -; 
ALL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; ALL-NEXT: vmovaps %xmm0, %xmm0 ; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <8 x i64> %a, <8 x i64> , <8 x i32> ret <8 x i64> %res diff --git a/llvm/test/CodeGen/X86/vselect-constants.ll b/llvm/test/CodeGen/X86/vselect-constants.ll --- a/llvm/test/CodeGen/X86/vselect-constants.ll +++ b/llvm/test/CodeGen/X86/vselect-constants.ll @@ -282,13 +282,9 @@ ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: pcmpeqw %xmm0, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [1,0,0,0] -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: psllw $15, %xmm1 -; SSE-NEXT: psraw $15, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: retq ; @@ -296,10 +292,10 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,0,0,0] -; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $15, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq