Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -4975,6 +4975,11 @@
   return true;
 }
+static bool canWidenShuffleElements(ArrayRef<int> Mask) {
+  SmallVector<int, 32> WidenedMask;
+  return canWidenShuffleElements(Mask, WidenedMask);
+}
+
 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
 bool X86::isZeroNode(SDValue Elt) {
   return isNullConstant(Elt) || isNullFPConstant(Elt);
 }
@@ -8954,6 +8959,12 @@
   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
 }
 
+static bool
+is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
+  SmallVector<int, 32> RepeatedMask;
+  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
+}
+
 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
 static bool
 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
@@ -23438,12 +23449,6 @@
       return R;
   }
 
-  // If possible, lower this packed shift into a vector multiply instead of
-  // expanding it into a sequence of scalar shifts.
-  if (Op.getOpcode() == ISD::SHL)
-    if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
-      return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
-
   // If possible, lower this shift as a sequence of two shifts by
   // constant plus a BLENDing shuffle instead of scalarizing it.
   // Example:
@@ -23454,7 +23459,8 @@
   //
   // The advantage is that the two shifts from the example would be
   // lowered as X86ISD::VSRLI nodes in parallel before blending.
-  if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
+  if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
+                      (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
     SDValue Amt1, Amt2;
     unsigned NumElts = VT.getVectorNumElements();
     SmallVector<int, 8> ShuffleMask;
@@ -23477,8 +23483,13 @@
       break;
     }
 
+    // Only perform this blend if we can perform it without loading a mask.
     if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
-        isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2)) {
+        isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2) &&
+        (VT != MVT::v16i16 ||
+         is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
+        (VT == MVT::v4i32 || Subtarget.hasSSE41() ||
+         Op.getOpcode() != ISD::SHL || canWidenShuffleElements(ShuffleMask))) {
       SDValue Splat1 =
           DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
@@ -23489,6 +23500,12 @@
     }
   }
 
+  // If possible, lower this packed shift into a vector multiply instead of
+  // expanding it into a sequence of scalar shifts.
+  if (Op.getOpcode() == ISD::SHL)
+    if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
+      return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
+
   // v4i32 Non Uniform Shifts.
   // If the shift amount is constant we can shift each lane using the SSE2
   // immediate shifts, else we need to zero-extend each lane to the lower i64
Index: llvm/trunk/test/CodeGen/X86/combine-shl.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/combine-shl.ll
+++ llvm/trunk/test/CodeGen/X86/combine-shl.ll
@@ -264,22 +264,14 @@
 ; SSE2-NEXT:    psrad $16, %xmm1
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,1073741824,1073741824]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [536870912,536870912,268435456,268435456]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pslld $31, %xmm2
+; SSE2-NEXT:    pslld $30, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pslld $29, %xmm2
+; SSE2-NEXT:    pslld $28, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_vec_shl_ext_shl1:
@@ -288,8 +280,14 @@
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SSE41-NEXT:    pmovsxwd %xmm1, %xmm1
 ; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
-; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pslld $30, %xmm2
+; SSE41-NEXT:    pslld $31, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    pslld $28, %xmm2
+; SSE41-NEXT:    pslld $29, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_shl_ext_shl1:
Index: llvm/trunk/test/CodeGen/X86/lower-vec-shift.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/lower-vec-shift.ll
+++ llvm/trunk/test/CodeGen/X86/lower-vec-shift.ll
@@ -266,10 +266,14 @@
 ;
 ; AVX1-LABEL: test11:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6],xmm2[7]
+; AVX1-NEXT:    vpsllw $3, %xmm0, %xmm2
+; AVX1-NEXT:    vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test11:
@@ -291,15 +295,20 @@
 ; AVX1-LABEL: test12:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,8,2,2,2,8,8,8]
-; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm2
+; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4],xmm2[5,6,7]
+; AVX1-NEXT:    vpsllw $3, %xmm0, %xmm2
+; AVX1-NEXT:    vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test12:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpsllw $3, %ymm0, %ymm1
+; AVX2-NEXT:    vpsllw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5,6,7],ymm0[8],ymm1[9],ymm0[10,11,12],ymm1[13,14,15]
 ; AVX2-NEXT:    retq
   %lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
   ret <16 x i16> %lshr
Index: llvm/trunk/test/CodeGen/X86/vec_shift6.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vec_shift6.ll
+++ llvm/trunk/test/CodeGen/X86/vec_shift6.ll
@@ -67,19 +67,18 @@
 define <4 x i32> @test4(<4 x i32> %a) {
 ; SSE2-LABEL: test4:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,2,2]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pslld $1, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: test4:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pslld $1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test4:
Index: llvm/trunk/test/CodeGen/X86/widen_arith-4.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/widen_arith-4.ll
+++ llvm/trunk/test/CodeGen/X86/widen_arith-4.ll
@@ -49,7 +49,6 @@
 ; SSE41-NEXT:    movw $0, -{{[0-9]+}}(%rsp)
 ; SSE41-NEXT:    movl $0, -{{[0-9]+}}(%rsp)
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = <271,271,271,271,271,u,u,u>
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = <2,4,2,2,2,u,u,u>
 ; SSE41-NEXT:    jmp .LBB0_1
 ; SSE41-NEXT:    .p2align 4, 0x90
 ; SSE41-NEXT:  .LBB0_2: # %forbody
@@ -58,10 +57,13 @@
 ; SSE41-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; SSE41-NEXT:    shlq $4, %rax
 ; SSE41-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
-; SSE41-NEXT:    movdqa (%rdx,%rax), %xmm2
-; SSE41-NEXT:    psubw %xmm0, %xmm2
-; SSE41-NEXT:    pmullw %xmm1, %xmm2
-; SSE41-NEXT:    pextrw $4, %xmm2, 8(%rcx,%rax)
+; SSE41-NEXT:    movdqa (%rdx,%rax), %xmm1
+; SSE41-NEXT:    psubw %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psllw $2, %xmm2
+; SSE41-NEXT:    psllw $1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
+; SSE41-NEXT:    pextrw $4, %xmm1, 8(%rcx,%rax)
 ; SSE41-NEXT:    movq %xmm2, (%rcx,%rax)
 ; SSE41-NEXT:    incl -{{[0-9]+}}(%rsp)
 ; SSE41-NEXT:  .LBB0_1: # %forcond