Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -5522,6 +5522,61 @@
   return DAG.getBitcast(VT, Result);
 }
 
+/// Attempts to lower a BUILD_VECTOR to a BLEND-matchable pattern.
+/// Possible if all operands are constants or UNDEFs, except for the first:
+///   build_vector(X, C0, C1, C2) ->
+///   vector_shuffle<4,1,2,3>(const_pool_vec(Ud,C0,C1,C2), scalar_to_vector(X))
+static SDValue LowerBuildVectorAsBLEND(SDValue BV, SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget) {
+  const TargetLowering &TLI = *Subtarget.getTargetLowering();
+  EVT VT = BV.getValueType();
+  EVT EltVT = VT.getScalarType();
+  unsigned NumElts = BV.getNumOperands();
+  SDLoc DL(BV);
+
+  if (VT != MVT::v4f32 && VT != MVT::v4i32)
+    return {};
+
+  // Check that the BUILD_VECTOR's last N-1 operands are constants.
+  for (unsigned i = 1; i != NumElts; ++i) {
+    const SDValue &Op = BV.getOperand(i);
+    if (!Op.isUndef() && !isa<ConstantFPSDNode>(Op) && !isa<ConstantSDNode>(Op))
+      return {};
+  }
+
+  // Build the vector of constants which will be loaded from the constant pool.
+  // FIXME: It would be better to construct a BUILD_VECTOR node of constants and
+  // undefs and let the legalizer expand it to a constant pool load.
+  // Unfortunately, this does not work because the constant nodes get legalized
+  // to individual constant pool loads before the BUILD_VECTOR is visited.
+  SmallVector<Constant *, 4> CV;
+  CV.reserve(NumElts);
+  // First element is an UNDEF since it will not be selected.
+  CV.push_back(UndefValue::get(EltVT.getTypeForEVT(*DAG.getContext())));
+  // Remaining elements will be selected.
+  for (unsigned i = 1; i != NumElts; ++i) {
+    if (ConstantFPSDNode *V = dyn_cast<ConstantFPSDNode>(BV->getOperand(i))) {
+      CV.push_back(const_cast<ConstantFP *>(V->getConstantFPValue()));
+    } else if (ConstantSDNode *V =
+                   dyn_cast<ConstantSDNode>(BV->getOperand(i))) {
+      CV.push_back(const_cast<ConstantInt *>(V->getConstantIntValue()));
+    } else {
+      assert(BV->getOperand(i).isUndef());
+      Type *OpNTy = EltVT.getTypeForEVT(*DAG.getContext());
+      CV.push_back(UndefValue::get(OpNTy));
+    }
+  }
+  Constant *CP = ConstantVector::get(CV);
+  SDValue CPIdx =
+      DAG.getConstantPool(CP, TLI.getPointerTy(DAG.getDataLayout()));
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+  SDValue C = DAG.getLoad(
+      VT, DL, DAG.getEntryNode(), CPIdx,
+      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment);
+  SDValue Elt0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, BV.getOperand(0));
+  int Mask[] = {4, 1, 2, 3};
+  return DAG.getVectorShuffle(VT, DL, C, Elt0, Mask);
+}
+
 /// Return a vector logical shift node.
 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
                          SelectionDAG &DAG, const TargetLowering &TLI,
@@ -6886,9 +6941,12 @@
   // For SSE 4.1, use insertps to put the high elements into the low element.
   if (Subtarget.hasSSE41()) {
     SDValue Result;
-    if (!Op.getOperand(0).isUndef())
+    if (!Op.getOperand(0).isUndef()) {
+      // Attempt to lower as a BLEND.
+      if (SDValue Res = LowerBuildVectorAsBLEND(Op, DAG, Subtarget))
+        return Res;
       Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
-    else
+    } else
       Result = DAG.getUNDEF(VT);
     for (unsigned i = 1; i < NumElems; ++i) {
Index: test/CodeGen/X86/vector-shuffle-combining.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-combining.ll
+++ test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2859,16 +2859,12 @@
 ;
 ; SSE41-LABEL: combine_constant_insertion_v4f32:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_constant_insertion_v4f32:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
 ; AVX-NEXT:    retq
   %a0 = insertelement <4 x float> undef, float %f, i32 0
   %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 30.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -2907,24 +2903,20 @@
 ; SSE41-LABEL: combine_constant_insertion_v4i32:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    movl $4, %eax
-; SSE41-NEXT:    pinsrd $1, %eax, %xmm0
-; SSE41-NEXT:    movl $5, %eax
-; SSE41-NEXT:    pinsrd $2, %eax, %xmm0
-; SSE41-NEXT:    movl $30, %eax
-; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_constant_insertion_v4i32:
-; AVX:       # BB#0:
-; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    movl $4, %eax
-; AVX-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT:    movl $5, %eax
-; AVX-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
-; AVX-NEXT:    movl $30, %eax
-; AVX-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: combine_constant_insertion_v4i32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovd %edi, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_constant_insertion_v4i32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovd %edi, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
+; AVX2-NEXT:    retq
   %a0 = insertelement <4 x i32> undef, i32 %f, i32 0
   %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
   ret <4 x i32> %ret
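
Note for reviewers: below is a minimal standalone reproducer for the new lowering path, kept out of the patch itself. It is a sketch: the function name @repro and the llc invocation are assumptions, while the constants and the expected single blendps mirror the updated combine_constant_insertion_v4f32 checks above.

; Run with: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 repro.ll -o -
define <4 x float> @repro(float %f) {
  ; A vector with a variable lane 0 and a constant tail. The DAG combiner
  ; folds the insertelement + shufflevector pair into
  ; build_vector(%f, 4.0, 5.0, 30.0), which LowerBuildVectorAsBLEND now
  ; turns into one blendps against a constant-pool vector instead of three
  ; insertps instructions.
  %a0 = insertelement <4 x float> undef, float %f, i32 0
  %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 30.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %ret
}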