Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -532,6 +532,16 @@
     return false;
   }
 
+  /// Given an insert-element vector (InsElt) and a scalar operation (Op),
+  /// return true if it would be profitable to convert the scalar operation into
+  /// a vector operation. This would normally be true if:
+  /// 1. The vector operation does not cost much more than a scalar version.
+  /// 2. The target can avoid a costly transfer from scalar to vector register
+  ///    by loading a scalar operand directly into a vector register.
+  virtual bool shouldLoadScalarIntoVectorOp(SDValue InsElt, SDValue Op) const {
+    return false;
+  }
+
   /// Return true if the target wants to use the optimization that
   /// turns ext(promotableInst1(...(promotableInstN(load)))) into
   /// promotedInst1(...(promotedInstN(ext(load)))).
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15089,6 +15089,65 @@
   return DAG.getBitcast(VT, Shuf);
 }
 
+/// Try to convert a load of a scalar + scalar binop + insert element into a
+/// load + insert element + vector binop. If we can load the scalar directly
+/// into a vector register, this eliminates a potentially expensive transfer
+/// from scalar register to vector register.
+static SDValue combineLoadBinopInsElt(SDNode *Ins, SelectionDAG &DAG) {
+  // TODO: This can be loosened to allow insertion into any constant vector.
+  SDValue UndefVec = Ins->getOperand(0);
+  SDValue BO = Ins->getOperand(1);
+  if (!UndefVec.isUndef() || !ISD::isBinaryOp(BO.getNode()) || !BO.hasOneUse())
+    return SDValue();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.shouldLoadScalarIntoVectorOp(SDValue(Ins, 0), BO))
+    return SDValue();
+
+  // We are matching a binop that has a loaded operand and a constant operand.
+  // This is complicated because the operands can be in either order (and we
+  // must capture that fact), and the constant can be either integer or FP.
+  EVT VecVT = Ins->getOperand(0).getValueType();
+  SDValue Ld, C;
+  SDLoc DL(Ins);
+  auto matchLoadAndConstant = [&](SDValue Op0, SDValue Op1) {
+    if (Op0.getOpcode() != ISD::LOAD)
+      return false;
+    // Splat a scalar constant operand for use in a vector op. The caller can
+    // adjust (blend) this constant with the original insertion vector constant.
+    if (auto *CInt = dyn_cast<ConstantSDNode>(Op1)) {
+      // Account for scalar operand size differences. For example, scalar shift
+      // amount may have a different type than the other operand/result.
+      unsigned Width = VecVT.getScalarSizeInBits();
+      C = DAG.getConstant(CInt->getAPIntValue().zextOrTrunc(Width), DL, VecVT);
+    } else if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op1)) {
+      C = DAG.getConstantFP(CFP->getValueAPF(), DL, VecVT);
+    } else {
+      return false;
+    }
+    Ld = Op0;
+    return true;
+  };
+
+  SDValue BO0 = BO.getOperand(0), BO1 = BO.getOperand(1);
+  bool Op0IsLoad = matchLoadAndConstant(BO0, BO1);
+  if (!Op0IsLoad && !matchLoadAndConstant(BO1, BO0))
+    return SDValue();
+
+  // ins undef, (bo (load X), C), index --> bo (ins undef, (load X), index), C'
+  // ins undef, (bo C, (load X)), index --> bo C', (ins undef, (load X), index)
+  SDValue NewInsert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
+                                  UndefVec, Op0IsLoad ? BO0 : BO1,
+                                  Ins->getOperand(2));
+
+  // TODO: We created a splat constant above because we did not check the
+  // insert index. If the insert index is a constant and/or we were not
+  // originally inserting into an undef constant, we should compute the
+  // other constant elements as needed.
+  return Op0IsLoad ? DAG.getNode(BO.getOpcode(), DL, VecVT, NewInsert, C)
+                   : DAG.getNode(BO.getOpcode(), DL, VecVT, C, NewInsert);
+}
+
 SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   SDValue InVec = N->getOperand(0);
   SDValue InVal = N->getOperand(1);
@@ -15118,6 +15177,9 @@
     return SDValue();
  }
 
+  if (SDValue BO = combineLoadBinopInsElt(N, DAG))
+    return BO;
+
   // We must know which element is being inserted for folds below here.
   unsigned Elt = IndexC->getZExtValue();
   if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
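For illustration only (this note is not part of the patch): expressed in IR terms, the combine rewrites the first pattern below into the second, assuming the target hook approves. The loaded scalar is inserted into the vector first, the scalar constant is splatted to the vector type, and the scalar binop becomes a vector binop. The %v name is only for this sketch; the actual transform runs on SelectionDAG nodes, not on IR.

  ; Before (this is the add_op1_constant case from the new test file):
  %x = load i32, i32* %p
  %b = add i32 %x, 42
  %r = insertelement <4 x i32> undef, i32 %b, i32 0

  ; After (roughly; the splatted constant is what matchLoadAndConstant builds):
  %x = load i32, i32* %p
  %v = insertelement <4 x i32> undef, i32 %x, i32 0
  %r = add <4 x i32> %v, <i32 42, i32 42, i32 42, i32 42>
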
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -835,6 +835,8 @@
 
     bool shouldSplatInsEltVarIndex(EVT VT) const override;
 
+    bool shouldLoadScalarIntoVectorOp(SDValue Ins, SDValue Op) const override;
+
     bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
       return VT.isScalarInteger();
     }
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -4812,6 +4812,33 @@
   return isTypeLegal(VT);
 }
 
+/// If we can load an integer scalar into a vector register with minimal
+/// shuffling and the vector operation is supported, then avoiding a transfer
+/// from GPR to vector is probably a win.
+bool X86TargetLowering::shouldLoadScalarIntoVectorOp(SDValue InsElt,
+                                                     SDValue BinOp) const {
+  // Without SSE2, we only have movss and no integer vector ops; don't bother.
+  // FP uses the same registers, so the transform would never help for an FP op.
+  EVT VecVT = InsElt.getValueType();
+  if (!Subtarget.hasSSE2() || VecVT.isFloatingPoint())
+    return false;
+
+  // Loading into the 0-index lane is possible with SSE2 using movd/movq.
+  // TODO: AVX1 and AVX2 can splat (broadcast) various scalar types.
+  EVT ScalarVT = BinOp.getValueType();
+  SDValue InsIdx = InsElt.getOperand(2);
+  if ((ScalarVT != MVT::i32 && ScalarVT != MVT::i64) || !isNullConstant(InsIdx))
+    return false;
+
+  // Filter out illegal vector types, ISA holes, and unsupported vector ops.
+  // TODO: This eliminates custom/promoted ops that are probably ok.
+  auto Opcode = BinOp.getOpcode();
+  if (!isTypeDesirableForOp(Opcode, VecVT) || !isOperationLegal(Opcode, VecVT))
+    return false;
+
+  return true;
+}
+
 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
   MVT VT = MVT::getIntegerVT(NumBits);
   if (isTypeLegal(VT))
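As a hedged sketch of what the X86 hook declines (hypothetical function, not part of the new test file below): the scalar type must be i32 or i64 and the insertion index must be a constant 0, so a pattern like the following stays as scalar code.

  ; Assumed example: shouldLoadScalarIntoVectorOp returns false here because the
  ; scalar type is i16 (only i32/i64 can be loaded directly into lane 0 with
  ; movd/movq), so the combine does not fire and the scalar sequence is kept.
  define <8 x i16> @add_op1_constant_i16(i16* %p) nounwind {
    %x = load i16, i16* %p
    %b = add i16 %x, 42
    %r = insertelement <8 x i16> undef, i16 %b, i32 0
    ret <8 x i16> %r
  }
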
Index: test/CodeGen/X86/load-scalar-as-vector.ll
===================================================================
--- test/CodeGen/X86/load-scalar-as-vector.ll
+++ test/CodeGen/X86/load-scalar-as-vector.ll
@@ -7,17 +7,22 @@
 define <4 x i32> @add_op1_constant(i32* %p) nounwind {
 ; SSE-LABEL: add_op1_constant:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movl (%rdi), %eax
-; SSE-NEXT:    addl $42, %eax
-; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: add_op1_constant:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movl (%rdi), %eax
-; AVX-NEXT:    addl $42, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: add_op1_constant:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: add_op1_constant:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [42,42,42,42]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %x = load i32, i32* %p
   %b = add i32 %x, 42
   %r = insertelement <4 x i32> undef, i32 %b, i32 0
@@ -47,16 +52,16 @@
 define <2 x i64> @sub_op0_constant(i64* %p) nounwind {
 ; SSE-LABEL: sub_op0_constant:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movl $42, %eax
-; SSE-NEXT:    subq (%rdi), %rax
-; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [42,42]
+; SSE-NEXT:    psubq %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sub_op0_constant:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    movl $42, %eax
-; AVX-NEXT:    subq (%rdi), %rax
-; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [42,42]
+; AVX-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %x = load i64, i64* %p
   %b = sub i64 42, %x
@@ -87,17 +92,30 @@
 }
 
 define <4 x i32> @mul_op1_constant(i32* %p) nounwind {
-; SSE-LABEL: mul_op1_constant:
-; SSE:       # %bb.0:
-; SSE-NEXT:    imull $42, (%rdi), %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: mul_op1_constant:
-; AVX:       # %bb.0:
-; AVX-NEXT:    imull $42, (%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    retq
+; SSE2-LABEL: mul_op1_constant:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    imull $42, (%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: mul_op1_constant:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE4-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: mul_op1_constant:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: mul_op1_constant:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [42,42,42,42]
+; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %x = load i32, i32* %p
   %b = mul i32 %x, 42
   %r = insertelement <4 x i32> undef, i32 %b, i32 0
@@ -147,16 +165,14 @@
 define <2 x i64> @or_op1_constant(i64* %p) nounwind {
 ; SSE-LABEL: or_op1_constant:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movq (%rdi), %rax
-; SSE-NEXT:    orq $42, %rax
-; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    orps {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: or_op1_constant:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    movq (%rdi), %rax
-; AVX-NEXT:    orq $42, %rax
-; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vorps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x = load i64, i64* %p
   %b = or i64 %x, 42