Index: llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h
+++ llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -1434,6 +1434,9 @@
   const APInt &getAPIntValue() const { return Value->getValue(); }
   uint64_t getZExtValue() const { return Value->getZExtValue(); }
   int64_t getSExtValue() const { return Value->getSExtValue(); }
+  uint64_t getLimitedValue(uint64_t Limit = UINT64_MAX) {
+    return Value->getLimitedValue(Limit);
+  }
 
   bool isOne() const { return Value->isOne(); }
   bool isNullValue() const { return Value->isZero(); }
Index: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -779,33 +779,38 @@
     break;
   }
   case ISD::SHL:
-    if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
-      unsigned ShAmt = SA->getZExtValue();
+    if (ConstantSDNode *SA = isConstOrConstSplat(Op.getOperand(1))) {
       SDValue InOp = Op.getOperand(0);
 
       // If the shift count is an invalid immediate, don't do anything.
-      if (ShAmt >= BitWidth)
+      if (SA->getAPIntValue().uge(BitWidth))
         break;
 
+      unsigned ShAmt = SA->getZExtValue();
+
       // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
       // single shift.  We can do this if the bottom bits (which are shifted
       // out) are never demanded.
-      if (InOp.getOpcode() == ISD::SRL &&
-          isa<ConstantSDNode>(InOp.getOperand(1))) {
-        if (ShAmt && (NewMask & APInt::getLowBitsSet(BitWidth, ShAmt)) == 0) {
-          unsigned C1= cast<ConstantSDNode>(InOp.getOperand(1))->getZExtValue();
-          unsigned Opc = ISD::SHL;
-          int Diff = ShAmt-C1;
-          if (Diff < 0) {
-            Diff = -Diff;
-            Opc = ISD::SRL;
-          }
+      if (InOp.getOpcode() == ISD::SRL) {
+        if (ConstantSDNode *SA2 = isConstOrConstSplat(InOp.getOperand(1))) {
+          if (ShAmt && (NewMask & APInt::getLowBitsSet(BitWidth, ShAmt)) == 0) {
+            if (SA2->getAPIntValue().ult(BitWidth)) {
+              unsigned C1 = SA2->getZExtValue();
+              unsigned Opc = ISD::SHL;
+              int Diff = ShAmt-C1;
+              if (Diff < 0) {
+                Diff = -Diff;
+                Opc = ISD::SRL;
+              }
 
-          SDValue NewSA =
-            TLO.DAG.getConstant(Diff, dl, Op.getOperand(1).getValueType());
-          EVT VT = Op.getValueType();
-          return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT,
-                                                   InOp.getOperand(0), NewSA));
+              SDValue NewSA =
+                TLO.DAG.getConstant(Diff, dl, Op.getOperand(1).getValueType());
+              EVT VT = Op.getValueType();
+              return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT,
+                                                       InOp.getOperand(0),
+                                                       NewSA));
+            }
+          }
         }
       }
 
@@ -817,7 +822,7 @@
       if (InOp.getNode()->getOpcode() == ISD::ANY_EXTEND) {
         SDValue InnerOp = InOp.getOperand(0);
         EVT InnerVT = InnerOp.getValueType();
-        unsigned InnerBits = InnerVT.getSizeInBits();
+        unsigned InnerBits = InnerVT.getScalarSizeInBits();
         if (ShAmt < InnerBits && NewMask.getActiveBits() <= InnerBits &&
             isTypeDesirableForOp(ISD::SHL, InnerVT)) {
           EVT ShTy = getShiftAmountTy(InnerVT, DL);
@@ -836,45 +841,42 @@
         // (shl (anyext x), c2-c1).  This requires that the bottom c1 bits
         // aren't demanded (as above) and that the shifted upper c1 bits of
        // x aren't demanded.
-        if (InOp.hasOneUse() &&
-            InnerOp.getOpcode() == ISD::SRL &&
-            InnerOp.hasOneUse() &&
-            isa<ConstantSDNode>(InnerOp.getOperand(1))) {
-          unsigned InnerShAmt = cast<ConstantSDNode>(InnerOp.getOperand(1))
-            ->getZExtValue();
-          if (InnerShAmt < ShAmt &&
-              InnerShAmt < InnerBits &&
-              NewMask.getActiveBits() <= (InnerBits - InnerShAmt + ShAmt) &&
-              NewMask.countTrailingZeros() >= ShAmt) {
-            SDValue NewSA =
-              TLO.DAG.getConstant(ShAmt - InnerShAmt, dl,
-                                  Op.getOperand(1).getValueType());
-            EVT VT = Op.getValueType();
-            SDValue NewExt = TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT,
-                                             InnerOp.getOperand(0));
-            return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT,
-                                                     NewExt, NewSA));
+        if (InOp.hasOneUse() && InnerOp.getOpcode() == ISD::SRL &&
+            InnerOp.hasOneUse()) {
+          if (ConstantSDNode *SA2 = isConstOrConstSplat(InnerOp.getOperand(1))) {
+            unsigned InnerShAmt = SA2->getLimitedValue(InnerBits);
+            if (InnerShAmt < ShAmt &&
+                InnerShAmt < InnerBits &&
+                NewMask.getActiveBits() <= (InnerBits - InnerShAmt + ShAmt) &&
+                NewMask.countTrailingZeros() >= ShAmt) {
+              SDValue NewSA =
+                TLO.DAG.getConstant(ShAmt - InnerShAmt, dl,
+                                    Op.getOperand(1).getValueType());
+              EVT VT = Op.getValueType();
+              SDValue NewExt = TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT,
+                                               InnerOp.getOperand(0));
+              return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT,
+                                                       NewExt, NewSA));
+            }
          }
        }
      }
 
-      Known.Zero <<= SA->getZExtValue();
-      Known.One <<= SA->getZExtValue();
+      Known.Zero <<= ShAmt;
+      Known.One <<= ShAmt;
       // low bits known zero.
-      Known.Zero.setLowBits(SA->getZExtValue());
+      Known.Zero.setLowBits(ShAmt);
     }
     break;
   case ISD::SRL:
-    if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
-      EVT VT = Op.getValueType();
-      unsigned ShAmt = SA->getZExtValue();
-      unsigned VTSize = VT.getSizeInBits();
+    if (ConstantSDNode *SA = isConstOrConstSplat(Op.getOperand(1))) {
       SDValue InOp = Op.getOperand(0);
 
       // If the shift count is an invalid immediate, don't do anything.
-      if (ShAmt >= BitWidth)
+      if (SA->getAPIntValue().uge(BitWidth))
         break;
 
+      unsigned ShAmt = SA->getZExtValue();
       APInt InDemandedMask = (NewMask << ShAmt);
 
       // If the shift is exact, then it does demand the low bits (and knows that
@@ -885,21 +887,27 @@
       // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
       // single shift.  We can do this if the top bits (which are shifted out)
      // are never demanded.
-      if (InOp.getOpcode() == ISD::SHL &&
-          isa<ConstantSDNode>(InOp.getOperand(1))) {
-        if (ShAmt && (NewMask & APInt::getHighBitsSet(VTSize, ShAmt)) == 0) {
-          unsigned C1= cast<ConstantSDNode>(InOp.getOperand(1))->getZExtValue();
-          unsigned Opc = ISD::SRL;
-          int Diff = ShAmt-C1;
-          if (Diff < 0) {
-            Diff = -Diff;
-            Opc = ISD::SHL;
-          }
+      if (InOp.getOpcode() == ISD::SHL) {
+        if (ConstantSDNode *SA2 = isConstOrConstSplat(InOp.getOperand(1))) {
+          if (ShAmt &&
+              (NewMask & APInt::getHighBitsSet(BitWidth, ShAmt)) == 0) {
+            if (SA2->getAPIntValue().ult(BitWidth)) {
+              unsigned C1 = SA2->getZExtValue();
+              unsigned Opc = ISD::SRL;
+              int Diff = ShAmt-C1;
+              if (Diff < 0) {
+                Diff = -Diff;
+                Opc = ISD::SHL;
+              }
 
-          SDValue NewSA =
-            TLO.DAG.getConstant(Diff, dl, Op.getOperand(1).getValueType());
-          return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT,
-                                                   InOp.getOperand(0), NewSA));
+              SDValue NewSA =
+                TLO.DAG.getConstant(Diff, dl, Op.getOperand(1).getValueType());
+              EVT VT = Op.getValueType();
+              return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT,
+                                                       InOp.getOperand(0),
+                                                       NewSA));
+            }
+          }
        }
      }
 
@@ -923,14 +931,14 @@
                              TLO.DAG.getNode(ISD::SRL, dl, Op.getValueType(),
                                              Op.getOperand(0), Op.getOperand(1)));
 
-    if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+    if (ConstantSDNode *SA = isConstOrConstSplat(Op.getOperand(1))) {
       EVT VT = Op.getValueType();
-      unsigned ShAmt = SA->getZExtValue();
 
       // If the shift count is an invalid immediate, don't do anything.
-      if (ShAmt >= BitWidth)
+      if (SA->getAPIntValue().uge(BitWidth))
         break;
 
+      unsigned ShAmt = SA->getZExtValue();
       APInt InDemandedMask = (NewMask << ShAmt);
 
       // If the shift is exact, then it does demand the low bits (and knows that
Index: llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
+++ llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
@@ -5529,6 +5529,12 @@
           (VSHLLi16 DPR:$Rn, 16)>;
 def : Pat<(v2i64 (NEONvshl (sext (v2i32 DPR:$Rn)), (i32 32))),
           (VSHLLi32 DPR:$Rn, 32)>;
+def : Pat<(v8i16 (NEONvshl (anyext (v8i8 DPR:$Rn)), (i32 8))),
+          (VSHLLi8 DPR:$Rn, 8)>;
+def : Pat<(v4i32 (NEONvshl (anyext (v4i16 DPR:$Rn)), (i32 16))),
+          (VSHLLi16 DPR:$Rn, 16)>;
+def : Pat<(v2i64 (NEONvshl (anyext (v2i32 DPR:$Rn)), (i32 32))),
+          (VSHLLi32 DPR:$Rn, 32)>;
 
 // VSHRN : Vector Shift Right and Narrow
 defm VSHRN : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i",
Index: llvm/trunk/test/CodeGen/ARM/vshll.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/vshll.ll
+++ llvm/trunk/test/CodeGen/ARM/vshll.ll
@@ -97,7 +97,7 @@
 
 define <4 x i32> @vshlls16_bad(<4 x i16>* %A) nounwind {
 ; CHECK-LABEL: vshlls16_bad:
-; CHECK: vmovl.s16
+; CHECK: vmovl.u16
 ; CHECK: vshl.i32
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %sext = sext <4 x i16> %tmp1 to <4 x i32>
Index: llvm/trunk/test/CodeGen/X86/combine-shl.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/combine-shl.ll
+++ llvm/trunk/test/CodeGen/X86/combine-shl.ll
@@ -193,17 +193,16 @@
 define <8 x i32> @combine_vec_shl_ext_shl0(<8 x i16> %x) {
 ; SSE-LABEL: combine_vec_shl_ext_shl0:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pmovsxwd %xmm0, %xmm2
-; SSE-NEXT:    pslld $20, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT:    pmovsxwd %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE-NEXT:    pslld $20, %xmm1
-; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    pslld $20, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_shl_ext_shl0:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX-NEXT:    vpslld $20, %ymm0, %ymm0
 ; AVX-NEXT:    retq
   %1 = shl <8 x i16> %x,
Index: llvm/trunk/test/CodeGen/X86/not-and-simplify.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/not-and-simplify.ll
+++ llvm/trunk/test/CodeGen/X86/not-and-simplify.ll
@@ -47,9 +47,7 @@
 define <16 x i8> @shrink_xor_constant2_splat(<16 x i8> %x) {
 ; ALL-LABEL: shrink_xor_constant2_splat:
 ; ALL:       # BB#0:
-; ALL-NEXT:    psllw $5, %xmm0
-; ALL-NEXT:    pand {{.*}}(%rip), %xmm0
-; ALL-NEXT:    pandn {{.*}}(%rip), %xmm0
+; ALL-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; ALL-NEXT:    retq
   %sh = shl <16 x i8> %x,
   %not = xor <16 x i8> %sh,
Index: llvm/trunk/test/CodeGen/X86/sse2-vector-shifts.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sse2-vector-shifts.ll
+++ llvm/trunk/test/CodeGen/X86/sse2-vector-shifts.ll
@@ -336,7 +336,6 @@
 define <4 x i32> @shl_zext_shl_v4i32(<4 x i16> %x) nounwind {
 ; CHECK-LABEL: shl_zext_shl_v4i32:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    pslld $19, %xmm0
 ; CHECK-NEXT:    retq
   %shl0 = shl <4 x i16> %x,
Index: llvm/trunk/test/CodeGen/X86/vector-blend.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-blend.ll
+++ llvm/trunk/test/CodeGen/X86/vector-blend.ll
@@ -985,17 +985,15 @@
 ; SSE41-LABEL: blend_neg_logic_v4i32_2:
 ; SSE41:       # BB#0: # %entry
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    psrad $31, %xmm1
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    psubd %xmm2, %xmm3
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm3
 ; SSE41-NEXT:    movaps %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: blend_neg_logic_v4i32_2:
 ; AVX:       # BB#0: # %entry
-; AVX-NEXT:    vpsrad $31, %xmm1, %xmm1
 ; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm2
 ; AVX-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
Index: llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll
+++ llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll
@@ -1559,13 +1559,8 @@
 ;
 ; X32-SSE-LABEL: splatconstant_rotate_mask_v2i64:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X32-SSE-NEXT:    psllq $15, %xmm1
 ; X32-SSE-NEXT:    psrlq $49, %xmm0
 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT:    por %xmm0, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %shl = shl <2 x i64> %a,
   %lshr = lshr <2 x i64> %a,
@@ -1581,7 +1576,6 @@
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    pslld $4, %xmm1
 ; SSE-NEXT:    psrld $28, %xmm0
-; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE-NEXT:    por %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
@@ -1591,7 +1585,6 @@
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vpslld $4, %xmm0, %xmm1
 ; AVX-NEXT:    vpsrld $28, %xmm0, %xmm0
-; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
@@ -1621,7 +1614,6 @@
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    pslld $4, %xmm1
 ; X32-SSE-NEXT:    psrld $28, %xmm0
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
 ; X32-SSE-NEXT:    por %xmm0, %xmm1
 ; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
Index: llvm/trunk/test/CodeGen/X86/vector-rotate-256.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-rotate-256.ll
+++ llvm/trunk/test/CodeGen/X86/vector-rotate-256.ll
@@ -997,10 +997,10 @@
 define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
 ; AVX1-LABEL: splatconstant_rotate_mask_v4i64:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsrlq $49, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpsrlq $49, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlq $49, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
Index: llvm/trunk/test/CodeGen/X86/widen_cast-4.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/widen_cast-4.ll
+++ llvm/trunk/test/CodeGen/X86/widen_cast-4.ll
@@ -26,7 +26,7 @@
 ; NARROW-NEXT:    psubw %xmm0, %xmm2
 ; NARROW-NEXT:    psllw $8, %xmm2
 ; NARROW-NEXT:    psraw $8, %xmm2
-; NARROW-NEXT:    psraw $2, %xmm2
+; NARROW-NEXT:    psrlw $2, %xmm2
 ; NARROW-NEXT:    pshufb %xmm1, %xmm2
 ; NARROW-NEXT:    movq %xmm2, (%edx,%eax,8)
 ; NARROW-NEXT:    incl (%esp)