Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -31813,6 +31813,7 @@
 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
     SDValue Op, const APInt &OriginalDemandedBits, KnownBits &Known,
     TargetLoweringOpt &TLO, unsigned Depth) const {
+  unsigned BitWidth = OriginalDemandedBits.getBitWidth();
   unsigned Opc = Op.getOpcode();
   switch(Opc) {
   case X86ISD::PMULDQ:
@@ -31829,6 +31830,40 @@
       return true;
     break;
   }
+  case X86ISD::VSHLI: {
+    if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      if (ShiftImm->getAPIntValue().uge(BitWidth))
+        break;
+
+      KnownBits KnownOp;
+      unsigned ShAmt = ShiftImm->getZExtValue();
+      APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
+      if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, KnownOp, TLO,
+                               Depth + 1))
+        return true;
+    }
+    break;
+  }
+  case X86ISD::VSRAI:
+  case X86ISD::VSRLI: {
+    if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      if (ShiftImm->getAPIntValue().uge(BitWidth))
+        break;
+
+      KnownBits KnownOp;
+      unsigned ShAmt = ShiftImm->getZExtValue();
+      APInt DemandedMask = OriginalDemandedBits << ShAmt;
+
+      if (Opc == X86ISD::VSRAI &&
+          OriginalDemandedBits.countLeadingZeros() < ShAmt)
+        DemandedMask.setSignBit();
+
+      if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, KnownOp, TLO,
+                               Depth + 1))
+        return true;
+    }
+    break;
+  }
   }

   return TargetLowering::SimplifyDemandedBitsForTargetNode(
@@ -34857,6 +34892,11 @@
     return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG,
                           SDLoc(N));
   }

+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.SimplifyDemandedBits(SDValue(N, 0),
+                               APInt::getAllOnesValue(NumBitsPerElt), DCI))
+    return SDValue(N, 0);
+
   return SDValue();
 }
Index: test/CodeGen/X86/combine-srl.ll
===================================================================
--- test/CodeGen/X86/combine-srl.ll
+++ test/CodeGen/X86/combine-srl.ll
@@ -63,17 +63,7 @@
 define <4 x i32> @combine_vec_lshr_known_zero1(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_lshr_known_zero1:
 ; SSE: # %bb.0:
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $11, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrld $9, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $10, %xmm1
-; SSE-NEXT: psrld $8, %xmm0
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE-NEXT: xorps %xmm0, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: combine_vec_lshr_known_zero1:
Index: test/CodeGen/X86/combine-udiv.ll
===================================================================
--- test/CodeGen/X86/combine-udiv.ll
+++ test/CodeGen/X86/combine-udiv.ll
@@ -669,20 +669,15 @@
 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; SSE41-NEXT: pmullw %xmm0, %xmm2
 ; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pmullw %xmm3, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; 
SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: packuswb %xmm0, %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: psllw $1, %xmm3 -; SSE41-NEXT: psllw $8, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: psllw $8, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: packuswb %xmm3, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: psllw $1, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] ; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: packuswb %xmm0, %xmm2 +; SSE41-NEXT: packuswb %xmm3, %xmm2 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -693,21 +688,16 @@ ; AVX1-NEXT: movl $171, %eax ; AVX1-NEXT: vmovd %eax, %xmm1 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $8, %xmm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpsllw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpsllw $1, %xmm3, %xmm3 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7] -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq Index: test/CodeGen/X86/known-signbits-vector.ll =================================================================== --- test/CodeGen/X86/known-signbits-vector.ll +++ test/CodeGen/X86/known-signbits-vector.ll @@ -91,17 +91,14 @@ ; X32-LABEL: signbits_ashr_extract_sitofp_1: ; X32: # %bb.0: ; X32-NEXT: pushl %eax -; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648] -; X32-NEXT: vpsrlq $63, %xmm1, %xmm2 -; X32-NEXT: vpsrlq $32, %xmm1, %xmm1 -; X32-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; X32-NEXT: vpsrlq $63, %xmm0, %xmm2 +; X32-NEXT: vpsrlq $63, %xmm0, %xmm1 ; X32-NEXT: vpsrlq $32, %xmm0, %xmm0 -; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,32768,0,0,1,0,0,0] ; X32-NEXT: vpxor 
%xmm1, %xmm0, %xmm0 ; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ; X32-NEXT: vmovd %xmm0, %eax -; X32-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0 +; X32-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 ; X32-NEXT: vmovss %xmm0, (%esp) ; X32-NEXT: flds (%esp) ; X32-NEXT: popl %eax @@ -128,18 +125,15 @@ ; X32-LABEL: signbits_ashr_shl_extract_sitofp: ; X32: # %bb.0: ; X32-NEXT: pushl %eax -; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648] -; X32-NEXT: vpsrlq $60, %xmm1, %xmm2 -; X32-NEXT: vpsrlq $61, %xmm1, %xmm1 -; X32-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; X32-NEXT: vpsrlq $60, %xmm0, %xmm2 +; X32-NEXT: vpsrlq $60, %xmm0, %xmm1 ; X32-NEXT: vpsrlq $61, %xmm0, %xmm0 -; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [4,0,0,0,8,0,0,0] ; X32-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ; X32-NEXT: vpsllq $20, %xmm0, %xmm0 ; X32-NEXT: vmovd %xmm0, %eax -; X32-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0 +; X32-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 ; X32-NEXT: vmovss %xmm0, (%esp) ; X32-NEXT: flds (%esp) ; X32-NEXT: popl %eax @@ -263,13 +257,10 @@ ; X32: # %bb.0: ; X32-NEXT: pushl %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X32-NEXT: vpsrlq $60, %xmm2, %xmm3 -; X32-NEXT: vpsrlq $61, %xmm2, %xmm2 -; X32-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; X32-NEXT: vpsrlq $60, %xmm0, %xmm3 +; X32-NEXT: vpsrlq $60, %xmm0, %xmm2 ; X32-NEXT: vpsrlq $61, %xmm0, %xmm0 -; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,0,0,8,0,0,0] ; X32-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; X32-NEXT: vpsubq %xmm2, %xmm0, %xmm0 ; X32-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1 @@ -281,7 +272,7 @@ ; X32-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; X32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X32-NEXT: vmovd %xmm0, %eax -; X32-NEXT: vcvtsi2ssl %eax, %xmm4, %xmm0 +; X32-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0 ; X32-NEXT: vmovss %xmm0, (%esp) ; X32-NEXT: flds (%esp) ; X32-NEXT: popl %eax @@ -320,13 +311,10 @@ ; X32-LABEL: signbits_ashr_sextvecinreg_bitops_extract_sitofp: ; X32: # %bb.0: ; X32-NEXT: pushl %eax -; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X32-NEXT: vpsrlq $60, %xmm2, %xmm3 -; X32-NEXT: vpsrlq $61, %xmm2, %xmm2 -; X32-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; X32-NEXT: vpsrlq $60, %xmm0, %xmm3 +; X32-NEXT: vpsrlq $60, %xmm0, %xmm2 ; X32-NEXT: vpsrlq $61, %xmm0, %xmm0 -; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,0,0,8,0,0,0] ; X32-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; X32-NEXT: vpsubq %xmm2, %xmm0, %xmm0 ; X32-NEXT: vpmovsxdq %xmm1, %xmm1 @@ -334,7 +322,7 @@ ; X32-NEXT: vpor %xmm1, %xmm2, %xmm1 ; X32-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; X32-NEXT: vmovd %xmm0, %eax -; X32-NEXT: vcvtsi2ssl %eax, %xmm4, %xmm0 +; X32-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0 ; X32-NEXT: vmovss %xmm0, (%esp) ; X32-NEXT: flds (%esp) ; X32-NEXT: popl %eax @@ -375,22 +363,19 @@ ; X32-NEXT: subl $16, %esp ; X32-NEXT: vpmovsxdq 16(%ebp), %xmm3 ; X32-NEXT: vpmovsxdq 8(%ebp), %xmm4 -; X32-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2147483648,0,2147483648] +; X32-NEXT: vextractf128 $1, %ymm2, %xmm5 ; X32-NEXT: vpsrlq $63, %xmm5, %xmm6 ; X32-NEXT: 
vpsrlq $33, %xmm5, %xmm5 ; X32-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7] -; X32-NEXT: vextractf128 $1, %ymm2, %xmm6 -; X32-NEXT: vpsrlq $63, %xmm6, %xmm7 -; X32-NEXT: vpsrlq $33, %xmm6, %xmm6 -; X32-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4,5,6,7] -; X32-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; X32-NEXT: vpsubq %xmm5, %xmm6, %xmm6 +; X32-NEXT: vmovdqa {{.*#+}} xmm6 = [0,16384,0,0,1,0,0,0] +; X32-NEXT: vpxor %xmm6, %xmm5, %xmm5 +; X32-NEXT: vpsubq %xmm6, %xmm5, %xmm5 ; X32-NEXT: vpsrlq $63, %xmm2, %xmm7 ; X32-NEXT: vpsrlq $33, %xmm2, %xmm2 ; X32-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4,5,6,7] -; X32-NEXT: vpxor %xmm5, %xmm2, %xmm2 -; X32-NEXT: vpsubq %xmm5, %xmm2, %xmm2 -; X32-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; X32-NEXT: vpxor %xmm6, %xmm2, %xmm2 +; X32-NEXT: vpsubq %xmm6, %xmm2, %xmm2 +; X32-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 ; X32-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; X32-NEXT: vextractf128 $1, %ymm1, %xmm4 ; X32-NEXT: vextractf128 $1, %ymm0, %xmm5 Index: test/CodeGen/X86/pr35918.ll =================================================================== --- test/CodeGen/X86/pr35918.ll +++ test/CodeGen/X86/pr35918.ll @@ -11,9 +11,9 @@ ; X86-SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SKYLAKE-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SKYLAKE-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-SKYLAKE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-SKYLAKE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SKYLAKE-NEXT: vpsrad $16, %xmm0, %xmm0 +; X86-SKYLAKE-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-SKYLAKE-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X86-SKYLAKE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] ; X86-SKYLAKE-NEXT: vpsrld $7, %xmm0, %xmm0 @@ -29,7 +29,7 @@ ; X86-SKX-NEXT: subl $8, %esp ; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SKX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[u,u],zero,zero,xmm0[u,u] +; X86-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,2,3,u,u,u,u,u,u,u,u] ; X86-SKX-NEXT: vpsrad $16, %xmm0, %xmm0 ; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -50,9 +50,9 @@ ; X64-SKYLAKE-LABEL: fetch_r16g16_snorm_unorm8: ; X64-SKYLAKE: # %bb.0: # %entry ; X64-SKYLAKE-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SKYLAKE-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-SKYLAKE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-SKYLAKE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X64-SKYLAKE-NEXT: vpsrad $16, %xmm0, %xmm0 +; X64-SKYLAKE-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKYLAKE-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-SKYLAKE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] ; X64-SKYLAKE-NEXT: vpsrld $7, %xmm0, %xmm0 @@ -65,7 +65,7 @@ ; X64-SKX-LABEL: fetch_r16g16_snorm_unorm8: ; X64-SKX: # %bb.0: # %entry ; X64-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SKX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[u,u],zero,zero,xmm0[u,u] +; X64-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,2,3,u,u,u,u,u,u,u,u] ; X64-SKX-NEXT: vpsrad $16, %xmm0, %xmm0 ; X64-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 Index: 
test/CodeGen/X86/vector-shift-ashr-128.ll =================================================================== --- test/CodeGen/X86/vector-shift-ashr-128.ll +++ test/CodeGen/X86/vector-shift-ashr-128.ll @@ -990,15 +990,11 @@ ; ; X32-SSE-LABEL: constant_shift_v2i64: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648] -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: psrlq $1, %xmm2 -; X32-SSE-NEXT: psrlq $7, %xmm1 -; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrlq $1, %xmm2 +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: psrlq $1, %xmm1 ; X32-SSE-NEXT: psrlq $7, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X32-SSE-NEXT: movapd {{.*#+}} xmm1 = [2.0E+0,7.2911220195563975E-304] ; X32-SSE-NEXT: xorpd %xmm1, %xmm0 ; X32-SSE-NEXT: psubq %xmm1, %xmm0 ; X32-SSE-NEXT: retl Index: test/CodeGen/X86/vector-shift-ashr-256.ll =================================================================== --- test/CodeGen/X86/vector-shift-ashr-256.ll +++ test/CodeGen/X86/vector-shift-ashr-256.ll @@ -1066,25 +1066,20 @@ ; ; X32-AVX1-LABEL: constant_shift_v4i64: ; X32-AVX1: # %bb.0: -; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X32-AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2 -; X32-AVX1-NEXT: vpsrlq $31, %xmm1, %xmm3 -; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X32-AVX1-NEXT: vpsrlq $62, %xmm3, %xmm4 -; X32-AVX1-NEXT: vpsrlq $31, %xmm3, %xmm3 -; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] -; X32-AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3 -; X32-AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 -; X32-AVX1-NEXT: vpsrlq $7, %xmm1, %xmm3 -; X32-AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 -; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,0,2,0,0,0] +; X32-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2 ; X32-AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 -; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; X32-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X32-AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,16384,0,0,0,256] +; X32-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX2-LABEL: constant_shift_v4i64: Index: test/CodeGen/X86/vector-trunc-usat.ll =================================================================== --- test/CodeGen/X86/vector-trunc-usat.ll +++ test/CodeGen/X86/vector-trunc-usat.ll @@ -716,26 +716,26 @@ define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) { ; SSE2-LABEL: trunc_usat_v8i32_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; 
SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm6 -; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm5 -; SSE2-NEXT: por %xmm1, %xmm5 -; SSE2-NEXT: pslld $16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pslld $16, %xmm4 +; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm5, %xmm0 +; SSE2-NEXT: packssdw %xmm4, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_usat_v8i32_v8i16: @@ -826,36 +826,36 @@ define <16 x i16> @trunc_usat_v16i32_v16i16(<16 x i32> %a0) { ; SSE2-LABEL: trunc_usat_v16i32_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535] -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm7, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] ; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm7, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pxor %xmm7, %xmm6 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm6, %xmm4 ; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: pxor %xmm7, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm6 -; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE2-NEXT: pxor %xmm5, %xmm7 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: por %xmm7, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: pslld $16, %xmm0 @@ -870,36 +870,36 @@ ; ; SSSE3-LABEL: trunc_usat_v16i32_v16i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535] -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = 
[2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pxor %xmm7, %xmm6 +; SSSE3-NEXT: movdqa %xmm1, %xmm8 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm2, %xmm7 +; SSSE3-NEXT: pxor %xmm6, %xmm7 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] ; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7 ; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: pxor %xmm7, %xmm1 ; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm3, %xmm6 -; SSSE3-NEXT: pxor %xmm7, %xmm6 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm6, %xmm4 ; SSSE3-NEXT: movdqa %xmm5, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: pxor %xmm7, %xmm2 ; SSSE3-NEXT: por %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm7, %xmm3 -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm6 -; SSSE3-NEXT: por %xmm6, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pandn %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pxor %xmm6, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm7, %xmm4 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 +; SSSE3-NEXT: pxor %xmm5, %xmm7 +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm7, %xmm5 ; SSSE3-NEXT: pslld $16, %xmm5 ; SSSE3-NEXT: psrad $16, %xmm5 ; SSSE3-NEXT: pslld $16, %xmm0