Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -5513,6 +5513,73 @@
   return DAG.getBitcast(VT, Result);
 }
 
+/// Attempts to lower a BUILD_VECTOR to a BLEND.
+// Possible if all operands are constants or UNDEFs, except for the first:
+//   build_vector(X, C0, C1, C2) ->
+//     BLEND(scalar_to_vector(X), const_pool_vec(Ud, C0, C1, C2), 0111)
+static SDValue LowerBuildVectorAsBLEND(SDValue BV, SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget) {
+  const TargetLowering &TLI = *Subtarget.getTargetLowering();
+  EVT VT = BV.getValueType();
+  EVT EltVT = VT.getScalarType();
+  unsigned NumElts = BV.getNumOperands();
+  SDLoc DL(BV);
+
+  // Only the 128-bit, 4-element forms are handled.
+  if (VT != MVT::v4f32 && VT != MVT::v4i32)
+    return {};
+
+  // Check that the BUILD_VECTOR's last N-1 operands are constants or undef.
+  for (unsigned i = 1; i != NumElts; ++i) {
+    const SDValue &Op = BV.getOperand(i);
+    if (!Op.isUndef() && !isa<ConstantSDNode>(Op) && !isa<ConstantFPSDNode>(Op))
+      return {};
+  }
+
+  // Build the vector of constants which will be loaded from the constant pool.
+  SmallVector<Constant *, 4> CV;
+  CV.reserve(NumElts);
+  // First element is an UNDEF since it will not be selected.
+  CV.push_back(UndefValue::get(EltVT.getTypeForEVT(*DAG.getContext())));
+  // Remaining elements will be selected.
+  for (unsigned i = 1; i != NumElts; ++i) {
+    if (ConstantFPSDNode *V = dyn_cast<ConstantFPSDNode>(BV->getOperand(i))) {
+      CV.push_back(const_cast<ConstantFP *>(V->getConstantFPValue()));
+    } else if (ConstantSDNode *V =
+                   dyn_cast<ConstantSDNode>(BV->getOperand(i))) {
+      CV.push_back(const_cast<ConstantInt *>(V->getConstantIntValue()));
+    } else {
+      assert(BV->getOperand(i).isUndef());
+      Type *OpNTy = EltVT.getTypeForEVT(*DAG.getContext());
+      CV.push_back(UndefValue::get(OpNTy));
+    }
+  }
+  Constant *CP = ConstantVector::get(CV);
+  SDValue CPIdx =
+      DAG.getConstantPool(CP, TLI.getPointerTy(DAG.getDataLayout()));
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+  SDValue C = DAG.getLoad(
+      VT, DL, DAG.getEntryNode(), CPIdx,
+      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment);
+  // The first element is placed into a vector which will be the BLEND's
+  // operand.
+  SDValue Elt0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, BV.getOperand(0));
+  // BLEND is natively an operation on float elements, so cast if needed.
+  EVT BlendVT = VT;
+  if (!EltVT.isFloatingPoint()) {
+    BlendVT = MVT::v4f32;
+    C = DAG.getBitcast(BlendVT, C);
+    Elt0 = DAG.getBitcast(BlendVT, Elt0);
+  }
+  // Blend mask 0xE = [0, 1, 1, 1]: element 0 from Elt0, the rest from C.
+  unsigned char BlendMask = 0xE;
+  SDValue Blend = DAG.getNode(X86ISD::BLENDI, DL, BlendVT, Elt0, C,
+                              DAG.getConstant(BlendMask, DL, MVT::i8));
+  // Bitcast back so the result type matches the original BUILD_VECTOR (the
+  // input may be v4i32 while the BLEND was performed on v4f32).
+  return DAG.getBitcast(VT, Blend);
+}
+
 /// Return a vector logical shift node.
 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
                          SelectionDAG &DAG, const TargetLowering &TLI,
@@ -6874,14 +6941,17 @@
   if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
     return Sh;
 
-  // For SSE 4.1, use insertps to put the high elements into the low element.
   if (Subtarget.hasSSE41()) {
     SDValue Result;
-    if (!Op.getOperand(0).isUndef())
+    if (!Op.getOperand(0).isUndef()) {
+      // Attempt to lower as a BLEND
+      if (SDValue Res = LowerBuildVectorAsBLEND(Op, DAG, Subtarget))
+        return Res;
       Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
+    }
     else
       Result = DAG.getUNDEF(VT);
-
+    // use insertps to put the high elements into the low element.
     for (unsigned i = 1; i < NumElems; ++i) {
       if (Op.getOperand(i).isUndef()) continue;
       Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
Index: test/CodeGen/X86/vector-shuffle-combining.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-combining.ll
+++ test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2859,16 +2859,12 @@
 ;
 ; SSE41-LABEL: combine_constant_insertion_v4f32:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_constant_insertion_v4f32:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
 ; AVX-NEXT:    retq
   %a0 = insertelement <4 x float> undef, float %f, i32 0
   %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 30.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -2907,23 +2903,13 @@
 ; SSE41-LABEL: combine_constant_insertion_v4i32:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    movl $4, %eax
-; SSE41-NEXT:    pinsrd $1, %eax, %xmm0
-; SSE41-NEXT:    movl $5, %eax
-; SSE41-NEXT:    pinsrd $2, %eax, %xmm0
-; SSE41-NEXT:    movl $30, %eax
-; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_constant_insertion_v4i32:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    movl $4, %eax
-; AVX-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT:    movl $5, %eax
-; AVX-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
-; AVX-NEXT:    movl $30, %eax
-; AVX-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
 ; AVX-NEXT:    retq
   %a0 = insertelement <4 x i32> undef, i32 %f, i32 0
   %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>