Index: llvm/trunk/include/llvm/CodeGen/SelectionDAG.h =================================================================== --- llvm/trunk/include/llvm/CodeGen/SelectionDAG.h +++ llvm/trunk/include/llvm/CodeGen/SelectionDAG.h @@ -1256,12 +1256,22 @@ const; /// Determine which bits of Op are known to be either zero or one and return - /// them in the KnownZero/KnownOne bitsets. Targets can implement the - /// computeKnownBitsForTargetNode method in the TargetLowering class to allow - /// target nodes to be understood. + /// them in the KnownZero/KnownOne bitsets. For vectors, the known bits are + /// those that are shared by every vector element. + /// Targets can implement the computeKnownBitsForTargetNode method in the + /// TargetLowering class to allow target nodes to be understood. void computeKnownBits(SDValue Op, APInt &KnownZero, APInt &KnownOne, unsigned Depth = 0) const; + /// Determine which bits of Op are known to be either zero or one and return + /// them in the KnownZero/KnownOne bitsets. The DemandedElts argument allows + /// us to only collect the known bits that are shared by the requested vector + /// elements. + /// Targets can implement the computeKnownBitsForTargetNode method in the + /// TargetLowering class to allow target nodes to be understood. + void computeKnownBits(SDValue Op, APInt &KnownZero, APInt &KnownOne, + const APInt &DemandedElts, unsigned Depth = 0) const; + /// Test if the given value is known to have exactly one bit set. This differs /// from computeKnownBits in that it doesn't necessarily determine which bit /// is set. Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2007,9 +2007,26 @@ } /// Determine which bits of Op are known to be either zero or one and return -/// them in the KnownZero/KnownOne bitsets. +/// them in the KnownZero/KnownOne bitsets. For vectors, the known bits are +/// those that are shared by every vector element. void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, APInt &KnownOne, unsigned Depth) const { + EVT VT = Op.getValueType(); + APInt DemandedElts = VT.isVector() + ? APInt::getAllOnesValue(VT.getVectorNumElements()) + : APInt(1, 1); + computeKnownBits(Op, KnownZero, KnownOne, DemandedElts, Depth); +} + +/// Determine which bits of Op are known to be either zero or one and return +/// them in the KnownZero/KnownOne bitsets. The DemandedElts argument allows +/// us to only collect the known bits that are shared by the requested vector +/// elements. +/// TODO: We only support DemandedElts on a few opcodes so far, the remainder +/// should be added when they become necessary. +void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, + APInt &KnownOne, const APInt &DemandedElts, + unsigned Depth) const { unsigned BitWidth = Op.getScalarValueSizeInBits(); KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. @@ -2017,6 +2034,10 @@ return; // Limit search depth. APInt KnownZero2, KnownOne2; + unsigned NumElts = DemandedElts.getBitWidth(); + + if (DemandedElts == APInt(NumElts, 0)) + return; // No demanded elts, better to assume we don't know anything. switch (Op.getOpcode()) { case ISD::Constant: @@ -2025,9 +2046,15 @@ KnownZero = ~KnownOne; break; case ISD::BUILD_VECTOR: - // Collect the known bits that are shared by every vector element. + // Collect the known bits that are shared by every demanded vector element. + assert(NumElts == Op.getValueType().getVectorNumElements() && + "Unexpected vector size"); KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth); - for (SDValue SrcOp : Op->ops()) { + for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { + if (!DemandedElts[i]) + continue; + + SDValue SrcOp = Op.getOperand(i); computeKnownBits(SrcOp, KnownZero2, KnownOne2, Depth + 1); // BUILD_VECTOR can implicitly truncate sources, we must handle this. @@ -2038,16 +2065,72 @@ KnownZero2 = KnownZero2.trunc(BitWidth); } - // Known bits are the values that are shared by every element. - // TODO: support per-element known bits. + // Known bits are the values that are shared by every demanded element. + KnownOne &= KnownOne2; + KnownZero &= KnownZero2; + } + break; + case ISD::VECTOR_SHUFFLE: { + // Collect the known bits that are shared by every vector element referenced + // by the shuffle. + APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0); + KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth); + const ShuffleVectorSDNode *SVN = cast(Op); + assert(NumElts == SVN->getMask().size() && "Unexpected vector size"); + for (unsigned i = 0; i != NumElts; ++i) { + int M = SVN->getMaskElt(i); + if (M < 0) { + // For UNDEF elements, we don't know anything about the common state of + // the shuffle result. + // FIXME: Is this too pessimistic? + KnownZero = KnownOne = APInt(BitWidth, 0); + break; + } + if (!DemandedElts[i]) + continue; + + if ((unsigned)M < NumElts) + DemandedLHS.setBit((unsigned)M % NumElts); + else + DemandedRHS.setBit((unsigned)M % NumElts); + } + // Known bits are the values that are shared by every demanded element. + if (DemandedLHS != APInt(NumElts, 0)) { + SDValue LHS = Op.getOperand(0); + computeKnownBits(LHS, KnownZero2, KnownOne2, DemandedLHS, Depth + 1); KnownOne &= KnownOne2; KnownZero &= KnownZero2; } + if (DemandedRHS != APInt(NumElts, 0)) { + SDValue RHS = Op.getOperand(1); + computeKnownBits(RHS, KnownZero2, KnownOne2, DemandedRHS, Depth + 1); + KnownOne &= KnownOne2; + KnownZero &= KnownZero2; + } + break; + } + case ISD::EXTRACT_SUBVECTOR: { + // If we know the element index, just demand that subvector elements, + // otherwise demand them all. + SDValue Src = Op.getOperand(0); + ConstantSDNode *SubIdx = dyn_cast(Op.getOperand(1)); + unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); + if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) { + // Offset the demanded elts by the subvector index. + uint64_t Idx = SubIdx->getZExtValue(); + APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx); + computeKnownBits(Src, KnownZero, KnownOne, DemandedSrc, Depth + 1); + } else { + computeKnownBits(Src, KnownZero, KnownOne, Depth + 1); + } break; + } case ISD::AND: // If either the LHS or the RHS are Zero, the result is zero. - computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); + computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, DemandedElts, + Depth + 1); + computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, + Depth + 1); // Output known-1 bits are only known if set in both the LHS & RHS. KnownOne &= KnownOne2; @@ -2206,7 +2289,8 @@ if (NewBits.getBoolValue()) InputDemandedBits |= InSignBit; - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, + Depth + 1); KnownOne &= InputDemandedBits; KnownZero &= InputDemandedBits; @@ -2253,7 +2337,8 @@ APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits); KnownZero = KnownZero.trunc(InBits); KnownOne = KnownOne.trunc(InBits); - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, + Depth + 1); KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); KnownZero |= NewBits; @@ -2266,7 +2351,8 @@ KnownZero = KnownZero.trunc(InBits); KnownOne = KnownOne.trunc(InBits); - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, + Depth + 1); // Note if the sign bit is known to be zero or one. bool SignBitKnownZero = KnownZero.isNegative(); @@ -2439,16 +2525,28 @@ // At the moment we keep this simple and skip tracking the specific // element. This way we get the lowest common denominator for all elements // of the vector. - // TODO: get information for given vector element + SDValue InVec = Op.getOperand(0); + SDValue EltNo = Op.getOperand(1); + EVT VecVT = InVec.getValueType(); const unsigned BitWidth = Op.getValueSizeInBits(); - const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits(); + const unsigned EltBitWidth = VecVT.getScalarSizeInBits(); + const unsigned NumSrcElts = VecVT.getVectorNumElements(); // If BitWidth > EltBitWidth the value is anyext:ed. So we do not know // anything about the extended bits. if (BitWidth > EltBitWidth) { KnownZero = KnownZero.trunc(EltBitWidth); KnownOne = KnownOne.trunc(EltBitWidth); } - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + ConstantSDNode *ConstEltNo = dyn_cast(EltNo); + if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts)) { + // If we know the element index, just demand that vector element. + unsigned Idx = ConstEltNo->getZExtValue(); + APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx); + computeKnownBits(InVec, KnownZero, KnownOne, DemandedElt, Depth + 1); + } else { + // Unknown element index, so ignore DemandedElts and demand them all. + computeKnownBits(InVec, KnownZero, KnownOne, Depth + 1); + } if (BitWidth > EltBitWidth) { KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); Index: llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll +++ llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll @@ -465,11 +465,9 @@ ; AVX1-LABEL: shuffle_v4i64_67zz: ; AVX1: ## BB#0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4i64_67zz: Index: llvm/trunk/test/CodeGen/X86/known-bits-vector.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/known-bits-vector.ll +++ llvm/trunk/test/CodeGen/X86/known-bits-vector.ll @@ -5,16 +5,14 @@ define i32 @knownbits_mask_extract_sext(<8 x i16> %a0) nounwind { ; X32-LABEL: knownbits_mask_extract_sext: ; X32: # BB#0: -; X32-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0 -; X32-NEXT: vmovd %xmm0, %eax -; X32-NEXT: cwtl +; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; X32-NEXT: vpextrw $0, %xmm0, %eax ; X32-NEXT: retl ; ; X64-LABEL: knownbits_mask_extract_sext: ; X64: # BB#0: -; X64-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vmovd %xmm0, %eax -; X64-NEXT: cwtl +; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpextrw $0, %xmm0, %eax ; X64-NEXT: retq %1 = and <8 x i16> %a0, %2 = extractelement <8 x i16> %1, i32 0 @@ -31,17 +29,10 @@ ; X32-NEXT: subl $16, %esp ; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] -; X32-NEXT: vpextrd $1, %xmm0, %eax ; X32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) -; X32-NEXT: xorl %ecx, %ecx -; X32-NEXT: testl %eax, %eax -; X32-NEXT: setns %cl ; X32-NEXT: fildll {{[0-9]+}}(%esp) -; X32-NEXT: fadds {{\.LCPI.*}}(,%ecx,4) ; X32-NEXT: fstps {{[0-9]+}}(%esp) -; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: vmovss %xmm0, (%esp) -; X32-NEXT: flds (%esp) +; X32-NEXT: flds {{[0-9]+}}(%esp) ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: retl @@ -51,18 +42,7 @@ ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] ; X64-NEXT: vmovq %xmm0, %rax -; X64-NEXT: testq %rax, %rax -; X64-NEXT: js .LBB1_1 -; X64-NEXT: # BB#2: -; X64-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 -; X64-NEXT: retq -; X64-NEXT: .LBB1_1: -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: shrq %rcx -; X64-NEXT: andl $1, %eax -; X64-NEXT: orq %rcx, %rax ; X64-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 -; X64-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; X64-NEXT: retq %1 = and <2 x i64> %a0, %2 = extractelement <2 x i64> %1, i32 0 @@ -74,15 +54,15 @@ ; X32-LABEL: knownbits_mask_shuffle_sext: ; X32: # BB#0: ; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X32-NEXT: vpmovsxwd %xmm0, %xmm0 +; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X32-NEXT: retl ; ; X64-LABEL: knownbits_mask_shuffle_sext: ; X64: # BB#0: ; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X64-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X64-NEXT: retq %1 = and <8 x i16> %a0, %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> @@ -95,22 +75,14 @@ ; X32: # BB#0: ; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 ; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; X32-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] -; X32-NEXT: vpsrld $16, %xmm0, %xmm0 -; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] -; X32-NEXT: vaddps {{\.LCPI.*}}, %xmm0, %xmm0 -; X32-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; X32-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: knownbits_mask_shuffle_uitofp: ; X64: # BB#0: ; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; X64-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] -; X64-NEXT: vpsrld $16, %xmm0, %xmm0 -; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] -; X64-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq %1 = and <4 x i32> %a0, %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32>