Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -884,6 +884,7 @@
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
     SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const;
     SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1068,11 +1068,14 @@
     // FIXME: Do we need to handle scalar-to-vector here?
     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);

-    setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
-    setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
+    setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
+    // There is no BLENDI for byte vectors, so we don't custom lower v16i8
+    // vselects for now.
     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
-    setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
-    setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);

     // i8 and i16 vectors are custom, because the source register and source
     // memory operand types are not the same width.  f32 vectors are
@@ -1188,10 +1191,10 @@
     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);

-    setOperationAction(ISD::VSELECT,           MVT::v4f64, Legal);
-    setOperationAction(ISD::VSELECT,           MVT::v4i64, Legal);
-    setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
-    setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
+    setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
+    setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
+    setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
+    setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);

     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
@@ -1236,6 +1239,7 @@
       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);

+      setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
     } else {
       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
@@ -4694,11 +4698,17 @@
   return getInsertVINSERTImmediate(N, 256);
 }

+/// isZero - Returns true if V is a constant integer zero.
+static bool isZero(SDValue V) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
+  return C && C->isNullValue();
+}
+
 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
 /// constant +0.0.
 bool X86::isZeroNode(SDValue Elt) {
-  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
-    return CN->isNullValue();
+  if (isZero(Elt))
+    return true;
   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
     return CFP->getValueAPF().isPosZero();
   return false;
@@ -7961,6 +7971,117 @@
   return SDValue();
 }

+static bool BUILD_VECTORtoBlendMask(SDValue BuildVector, unsigned &MaskValue,
+                                    SelectionDAG &DAG) {
+  // Check the mask for BLEND and build the value.
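+  //
+  // Illustrative example (not part of the original comment): a v8i16
+  // condition of
+  //   (build_vector 1, 0, 0, 0, 1, 0, 0, 0)
+  // has a single lane and yields MaskValue == 0x11.  A v16i16 condition has
+  // two 8-element lanes, which must agree (or be undef) on every mask bit,
+  // since VPBLENDW applies one 8-bit immediate to both 128-bit lanes.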
+  MaskValue = 0;
+  unsigned NumElems = BuildVector.getNumOperands();
+  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+  unsigned NumLanes = (NumElems - 1) / 8 + 1;
+  unsigned NumElemsInLane = NumElems / NumLanes;
+  SDValue Undef = DAG.getUNDEF(MVT::i1);
+
+  // A blend for v16i16 has to be symmetric for both lanes.
+  for (unsigned i = 0; i < NumElemsInLane; ++i) {
+    // The mask bits have to come from a BUILD_VECTOR of constants.
+    SDValue SndLaneEltCond =
+        (NumLanes == 2) ? BuildVector.getOperand(i + NumElemsInLane) : Undef;
+    SDValue EltCond = BuildVector.getOperand(i);
+
+    int Lane1Cond, Lane2Cond;
+    if (isa<ConstantSDNode>(EltCond))
+      Lane1Cond = !isZero(EltCond);
+    else if (EltCond.getOpcode() == ISD::UNDEF)
+      Lane1Cond = -1;
+    else
+      return false;
+    if (isa<ConstantSDNode>(SndLaneEltCond))
+      Lane2Cond = !isZero(SndLaneEltCond);
+    else if (SndLaneEltCond.getOpcode() == ISD::UNDEF)
+      Lane2Cond = -1;
+    else
+      return false;
+
+    if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
+      MaskValue |= !!Lane1Cond << i;
+    else if (Lane1Cond < 0)
+      MaskValue |= !!Lane2Cond << i;
+    else
+      return false;
+  }
+  return true;
+}
+
+// Try to lower a vselect node into a simple blend instruction.
+static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget,
+                                   SelectionDAG &DAG) {
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+  if (Cond.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  // Check the mask for BLEND and build the value.
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(Cond, MaskValue, DAG))
+    return SDValue();
+
+  // Convert i32 vectors to floating point if the target doesn't have AVX2.
+  // AVX2 introduced VPBLENDD for 128- and 256-bit vectors.
+  MVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+                               NumElems);
+    LHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, LHS);
+    RHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, RHS);
+  }
+
+  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
+                            DAG.getConstant(MaskValue, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
+SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG);
+  if (BlendOp.getNode())
+    return BlendOp;
+
+  // Some vselect types were previously set to Expand, not Legal or Custom.
+  // Return an empty SDValue for them so we fall through to Expand after the
+  // Custom lowering phase.
+  MVT VT = Op.getSimpleValueType();
+  switch (VT.SimpleTy) {
+  default:
+    break;
+  case MVT::v8i16:
+  case MVT::v16i16:
+    return SDValue();
+  }
+
+  // We couldn't create a "blend with immediate" node.
+  // This node should still be legal, but we'll have to emit a blendv*
+  // instruction.
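+  //
+  // Illustrative example: a v4f32 vselect whose condition is only known at
+  // run time can't use BLENDPS (it needs a constant immediate); instruction
+  // selection instead matches a variable blend such as BLENDVPS, which takes
+  // the mask in a vector register (implicitly XMM0 in the SSE4.1 encoding).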
+  return Op;
+}
+
 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
@@ -10746,11 +10867,6 @@
   return false;
 }

-static bool isZero(SDValue V) {
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
-  return C && C->isNullValue();
-}
-
 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
   if (V.getOpcode() != ISD::TRUNCATE)
     return false;
@@ -14326,6 +14442,7 @@
   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op, Subtarget, DAG);
@@ -17631,6 +17748,50 @@
   return std::make_pair(Opc, NeedSplit);
 }

+static SDValue
+TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  SDValue Cond = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+
+  if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
+    SDValue CondSrc = Cond->getOperand(0);
+    if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
+      Cond = CondSrc->getOperand(0);
+  }
+
+  MVT VT = N->getSimpleValueType(0);
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+  if (Cond.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(Cond, MaskValue, DAG))
+    return SDValue();
+
+  SmallVector<int, 8> ShuffleMask(NumElems, -1);
+  for (unsigned i = 0; i < NumElems; ++i) {
+    // Be sure we emit undef where we can.
+    if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
+      ShuffleMask[i] = -1;
+    else
+      ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
+  }
+
+  return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
+}
+
 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and
 /// VSELECT nodes.
 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
@@ -18166,6 +18327,23 @@
     DCI.CommitTargetLoweringOpt(TLO);
   }

+  // We should generate an X86ISD::BLENDI from a vselect if its condition
+  // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
+  // constants. This specific pattern gets generated when we split a
+  // selector for a 512-bit vector on a machine without AVX-512 (but with
+  // 256-bit vectors), during legalization:
+  //
+  //   (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
+  //
+  // Iff we find this pattern and the build_vectors are built from
+  // constants, we translate the vselect into a shuffle_vector that we know
+  // will be matched by LowerVECTOR_SHUFFLEtoBlend.
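+  //
+  // Illustrative example: a v8i32 half of a split v16i32 select whose
+  // condition legalized to
+  //   (sign_extend (sign_extend_inreg (build_vector 1, 0, 1, 0, ...)))
+  // is rewritten here into a single (vector_shuffle LHS, RHS) node, which
+  // the existing shuffle lowering can then match as a blend with immediate.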
+  if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
+    SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+    if (Shuffle.getNode())
+      return Shuffle;
+  }
+
   return SDValue();
 }

Index: test/CodeGen/X86/avx-blend.ll
===================================================================
--- test/CodeGen/X86/avx-blend.ll
+++ test/CodeGen/X86/avx-blend.ll
@@ -3,7 +3,7 @@
 ; AVX128 tests:

 ;CHECK-LABEL: vsel_float:
-;CHECK: vblendvps
+;CHECK: vblendps $5
 ;CHECK: ret
 define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
@@ -12,7 +12,7 @@

 ;CHECK-LABEL: vsel_i32:
-;CHECK: vblendvps
+;CHECK: vblendps $5
 ;CHECK: ret
 define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
@@ -52,7 +52,7 @@

 ;CHECK-LABEL: vsel_float8:
 ;CHECK-NOT: vinsertf128
-;CHECK: vblendvps
+;CHECK: vblendps $17
 ;CHECK: ret
 define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2
@@ -61,7 +61,7 @@

 ;CHECK-LABEL: vsel_i328:
 ;CHECK-NOT: vinsertf128
-;CHECK: vblendvps
+;CHECK: vblendps $17
 ;CHECK-NEXT: ret
 define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2
@@ -69,7 +69,8 @@
 }

 ;CHECK-LABEL: vsel_double8:
-;CHECK: vblendvpd
+;CHECK: vblendpd $1
+;CHECK: vblendpd $1
 ;CHECK: ret
 define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x double> %v1, <8 x double> %v2
@@ -77,7 +78,8 @@
 }

 ;CHECK-LABEL: vsel_i648:
-;CHECK: vblendvpd
+;CHECK: vblendpd $1
+;CHECK: vblendpd $1
 ;CHECK: ret
 define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2
@@ -86,7 +88,7 @@
 }

 ;CHECK-LABEL: vsel_double4:
 ;CHECK-NOT: vinsertf128
-;CHECK: vblendvpd
+;CHECK: vshufpd $10
 ;CHECK-NEXT: ret
 define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
@@ -111,5 +113,3 @@
   %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y
   ret <2 x double> %min
 }
-
-
Index: test/CodeGen/X86/avx2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/avx2.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+
+define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @blendvb_fallback_v4i32
+; CHECK: vblendvps
+; CHECK: ret
+  %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %ret
+}
+
+define <8 x i32> @blendvb_fallback_v8i32(<8 x i1> %mask, <8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: @blendvb_fallback_v8i32
+; CHECK: vblendvps
+; CHECK: ret
+  %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+  ret <8 x i32> %ret
+}
+
+define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x float> %y) {
+; CHECK-LABEL: @blendvb_fallback_v8f32
+; CHECK: vblendvps
+; CHECK: ret
+  %ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
+  ret <8 x float> %ret
+}
Index: test/CodeGen/X86/blend-msb.ll
===================================================================
--- test/CodeGen/X86/blend-msb.ll
+++ test/CodeGen/X86/blend-msb.ll
@@ -4,7 +4,7 @@

 ; Verify that we produce movss instead of blendvps when possible.
 ;CHECK-LABEL: vsel_float:
-;CHECK-NOT: blendvps
+;CHECK-NOT: blend
 ;CHECK: movss
 ;CHECK: ret
 define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
@@ -13,7 +13,7 @@
 }

 ;CHECK-LABEL: vsel_4xi8:
-;CHECK-NOT: blendvps
+;CHECK-NOT: blend
 ;CHECK: movss
 ;CHECK: ret
 define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i8> %v1, <4 x i8> %v2
   ret <4 x i8> %vsel
 }
@@ -21,14 +21,8 @@
-
-; We do not have native support for v8i16 blends and we have to use the
-; blendvb instruction or a sequence of NAND/OR/AND. Make sure that we do not
-; reduce the mask in this case.
 ;CHECK-LABEL: vsel_8xi16:
-;CHECK: andps
-;CHECK: andps
-;CHECK: orps
+;CHECK: pblendw $17
 ;CHECK: ret
 define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
Index: test/CodeGen/X86/sse41-blend.ll
===================================================================
--- test/CodeGen/X86/sse41-blend.ll
+++ test/CodeGen/X86/sse41-blend.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s

 ;CHECK-LABEL: vsel_float:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
@@ -10,7 +10,7 @@

 ;CHECK-LABEL: vsel_4xi8:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i8> %v1, <4 x i8> %v2
   ret <4 x i8> %vsel
 }

 ;CHECK-LABEL: vsel_4xi16:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i16> %v1, <4 x i16> %v2
@@ -27,7 +27,7 @@

 ;CHECK-LABEL: vsel_i32:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
Index: test/CodeGen/X86/sse41.ll
===================================================================
--- test/CodeGen/X86/sse41.ll
+++ test/CodeGen/X86/sse41.ll
@@ -576,3 +576,11 @@
   %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
   ret <4 x float> %res
 }
+
+define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: blendvb_fallback
+; CHECK: blendvb
+; CHECK: ret
+  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
+  ret <8 x i16> %ret
+}