Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -748,6 +748,17 @@
     return true;
   }

+  /// Given an insert-element vector (InsElt) and a scalar operation (Op),
+  /// return true if it would be profitable to convert the scalar operation into
+  /// a vector operation. This would normally be true if:
+  /// 1. The vector operation does not cost much more than a scalar version.
+  /// 2. The target can avoid a costly transfer from scalar to vector register
+  ///    by loading a scalar operand directly into a vector register.
+  virtual bool shouldLoadScalarIntoVectorOp(SDValue InsElt, SDValue Op,
+                                            SelectionDAG &DAG) const {
+    return false;
+  }
+
   /// Return true if the target wants to use the optimization that
   /// turns ext(promotableInst1(...(promotableInstN(load)))) into
   /// promotedInst1(...(promotedInstN(ext(load)))).
Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17267,6 +17267,65 @@
   return DAG.getBitcast(VT, Shuf);
 }

+/// Try to convert a load of a scalar + scalar binop + insert element into a
+/// load + insert element + vector binop. If we can load the scalar directly
+/// into a vector register, this eliminates a potentially expensive transfer
+/// from scalar register to vector register.
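+///
+/// For example (mirroring the first case in the new X86 test
+/// load-scalar-as-vector.ll):
+///   %x = load i32, i32* %p
+///   %b = add i32 %x, 42
+///   %r = insertelement <4 x i32> undef, i32 %b, i32 0
+/// can become an insert of the loaded value into lane 0 followed by a vector
+/// add with a splatted constant, if the target approves the conversion via
+/// shouldLoadScalarIntoVectorOp.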
+static SDValue combineLoadBinopInsElt(SDNode *Ins, SelectionDAG &DAG) {
+  // TODO: This can be loosened to allow insertion into any constant vector.
+  SDValue UndefVec = Ins->getOperand(0);
+  SDValue BO = Ins->getOperand(1);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!UndefVec.isUndef() || !TLI.isBinOp(BO.getOpcode()) || !BO.hasOneUse())
+    return SDValue();
+
+  if (!TLI.shouldLoadScalarIntoVectorOp(SDValue(Ins, 0), BO, DAG))
+    return SDValue();
+
+  // We are matching a binop that has a loaded operand and a constant operand.
+  // This is complicated because the operands can be in either order (and we
+  // must capture that fact), and the constant can be either integer or FP.
+  EVT VecVT = Ins->getOperand(0).getValueType();
+  SDValue Ld, C;
+  SDLoc DL(Ins);
+  auto matchLoadAndConstant = [&](SDValue Op0, SDValue Op1) {
+    if (Op0.getOpcode() != ISD::LOAD)
+      return false;
+    // Splat a scalar constant operand for use in a vector op. The caller can
+    // adjust (blend) this constant with the original insertion vector constant.
+    if (auto *CInt = dyn_cast<ConstantSDNode>(Op1)) {
+      // Account for scalar operand size differences. For example, scalar shift
+      // amount may have a different type than the other operand/result.
+      unsigned Width = VecVT.getScalarSizeInBits();
+      C = DAG.getConstant(CInt->getAPIntValue().zextOrTrunc(Width), DL, VecVT);
+    } else if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op1)) {
+      C = DAG.getConstantFP(CFP->getValueAPF(), DL, VecVT);
+    } else {
+      return false;
+    }
+    Ld = Op0;
+    return true;
+  };
+
+  SDValue BO0 = BO.getOperand(0), BO1 = BO.getOperand(1);
+  bool Op0IsLoad = matchLoadAndConstant(BO0, BO1);
+  if (!Op0IsLoad && !matchLoadAndConstant(BO1, BO0))
+    return SDValue();
+
+  // ins undef, (bo (load X), C), index --> bo (ins undef, (load X), index), C'
+  // ins undef, (bo C, (load X)), index --> bo C', (ins undef, (load X), index)
+  SDValue NewInsert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
+                                  UndefVec, Op0IsLoad ? BO0 : BO1,
+                                  Ins->getOperand(2));
+
+  // TODO: We created a splat constant above because we did not check the
+  // insert index. If the insert index is a constant and/or we were not
+  // originally inserting into an undef constant, we should compute the
+  // other constant elements as needed.
+  return Op0IsLoad ? DAG.getNode(BO.getOpcode(), DL, VecVT, NewInsert, C)
+                   : DAG.getNode(BO.getOpcode(), DL, VecVT, C, NewInsert);
+}
+
 SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   SDValue InVec = N->getOperand(0);
   SDValue InVal = N->getOperand(1);
@@ -17306,6 +17365,9 @@
   unsigned NumElts = VT.getVectorNumElements();

+  if (SDValue BO = combineLoadBinopInsElt(N, DAG))
+    return BO;
+
   // We must know which element is being inserted for folds below here.
   unsigned Elt = IndexC->getZExtValue();
   if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
Index: llvm/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.h
+++ llvm/lib/Target/X86/X86ISelLowering.h
@@ -1025,6 +1025,9 @@
     bool shouldSplatInsEltVarIndex(EVT VT) const override;

+    bool shouldLoadScalarIntoVectorOp(SDValue Ins, SDValue Op,
+                                      SelectionDAG &DAG) const override;
+
     bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
       return VT.isScalarInteger();
     }
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5376,6 +5376,39 @@
   return isTypeLegal(VT);
 }

+/// If we can load an integer scalar into a vector register with minimal
+/// shuffling and the vector operation is supported, then avoiding a transfer
+/// from GPR to vector is probably a win.
+bool X86TargetLowering::shouldLoadScalarIntoVectorOp(SDValue InsElt,
+                                                     SDValue BinOp,
+                                                     SelectionDAG &DAG) const {
+  // Without SSE2, we only have movss and no integer vector ops; don't bother.
+  // FP uses the same registers, so the transform would never help for an FP op.
+  EVT VecVT = InsElt.getValueType();
+  if (!Subtarget.hasSSE2() || VecVT.isFloatingPoint())
+    return false;
+
+  // Don't try this when optimizing for size because vector code and vector
+  // constants are probably bigger than their scalar counterparts.
+  if (DAG.getMachineFunction().getFunction().hasOptSize())
+    return false;
+
+  // Loading into the 0-index lane is possible with SSE2 using movd/movq.
+  // TODO: AVX1 and AVX2 can splat (broadcast) various scalar types.
+  EVT ScalarVT = BinOp.getValueType();
+  SDValue InsIdx = InsElt.getOperand(2);
+  if ((ScalarVT != MVT::i32 && ScalarVT != MVT::i64) || !isNullConstant(InsIdx))
+    return false;
+
+  // Filter out illegal vector types, ISA holes, and unsupported vector ops.
+  // TODO: This eliminates custom/promoted ops that are probably ok.
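+  // For example, v4i32 multiply only becomes a legal operation with SSE4.1
+  // (pmulld), so the SSE2 run lines in the new test keep the scalar imull.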
+  auto Opcode = BinOp.getOpcode();
+  if (!isTypeDesirableForOp(Opcode, VecVT) || !isOperationLegal(Opcode, VecVT))
+    return false;
+
+  return true;
+}
+
 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
   MVT VT = MVT::getIntegerVT(NumBits);
   if (isTypeLegal(VT))
Index: llvm/test/CodeGen/X86/load-scalar-as-vector.ll
===================================================================
--- llvm/test/CodeGen/X86/load-scalar-as-vector.ll
+++ llvm/test/CodeGen/X86/load-scalar-as-vector.ll
@@ -8,17 +8,29 @@
 define <4 x i32> @add_op1_constant(i32* %p) nounwind {
 ; SSE-LABEL: add_op1_constant:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movl (%rdi), %eax
-; SSE-NEXT:    addl $42, %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: add_op1_constant:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movl (%rdi), %eax
-; AVX-NEXT:    addl $42, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    retq
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: add_op1_constant:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: add_op1_constant:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [42,42,42,42]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: add_op1_constant:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [42,42,42,42]
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %x = load i32, i32* %p
   %b = add i32 %x, 42
   %r = insertelement <4 x i32> undef, i32 %b, i32 0
@@ -70,16 +82,16 @@
 define <2 x i64> @sub_op0_constant(i64* %p) nounwind {
 ; SSE-LABEL: sub_op0_constant:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movl $42, %eax
-; SSE-NEXT:    subq (%rdi), %rax
-; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [42,42]
+; SSE-NEXT:    psubq %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sub_op0_constant:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    movl $42, %eax
-; AVX-NEXT:    subq (%rdi), %rax
-; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [42,42]
+; AVX-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %x = load i64, i64* %p
   %b = sub i64 42, %x
@@ -110,17 +122,37 @@
 }

 define <4 x i32> @mul_op1_constant(i32* %p) nounwind {
-; SSE-LABEL: mul_op1_constant:
-; SSE:       # %bb.0:
-; SSE-NEXT:    imull $42, (%rdi), %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: mul_op1_constant:
-; AVX:       # %bb.0:
-; AVX-NEXT:    imull $42, (%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    retq
+; SSE2-LABEL: mul_op1_constant:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    imull $42, (%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: mul_op1_constant:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE4-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: mul_op1_constant:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: mul_op1_constant:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [42,42,42,42]
+; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: mul_op1_constant:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [42,42,42,42]
+; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %x = load i32, i32* %p
   %b = mul i32 %x, 42
   %r = insertelement <4 x i32> undef, i32 %b, i32 0
@@ -150,17 +182,29 @@
 define <4 x i32> @and_op1_constant(i32* %p) nounwind {
 ; SSE-LABEL: and_op1_constant:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movl (%rdi), %eax
-; SSE-NEXT:    andl $42, %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: and_op1_constant:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movl (%rdi), %eax
-; AVX-NEXT:    andl $42, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    retq
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: and_op1_constant:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: and_op1_constant:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [42,42,42,42]
+; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: and_op1_constant:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [42,42,42,42]
+; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %x = load i32, i32* %p
   %b = and i32 %x, 42
   %r = insertelement <4 x i32> undef, i32 %b, i32 0
@@ -170,16 +214,14 @@
 define <2 x i64> @or_op1_constant(i64* %p) nounwind {
 ; SSE-LABEL: or_op1_constant:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movq (%rdi), %rax
-; SSE-NEXT:    orq $42, %rax
-; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    orps {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: or_op1_constant:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    movq (%rdi), %rax
-; AVX-NEXT:    orq $42, %rax
-; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vorps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x = load i64, i64* %p
   %b = or i64 %x, 42
@@ -717,12 +759,26 @@
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: add_op1_constant_v8i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movl (%rdi), %eax
-; AVX-NEXT:    addl $42, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: add_op1_constant_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movl (%rdi), %eax
+; AVX1-NEXT:    addl $42, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: add_op1_constant_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [42,42,42,42,42,42,42,42]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: add_op1_constant_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [42,42,42,42,42,42,42,42]
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
   %x = load i32, i32* %p
   %b = add i32 %x, 42
   %r = insertelement <8 x i32> undef, i32 %b, i32 0
@@ -737,12 +793,26 @@
 ; SSE-NEXT:    movq %rax, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: sub_op0_constant_v4i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movl $42, %eax
-; AVX-NEXT:    subq (%rdi), %rax
-; AVX-NEXT:    vmovq %rax, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: sub_op0_constant_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movl $42, %eax
+; AVX1-NEXT:    subq (%rdi), %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sub_op0_constant_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [42,42,42,42]
+; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sub_op0_constant_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [42,42,42,42]
+; AVX512-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
   %x = load i64, i64* %p
   %b = sub i64 42, %x
   %r = insertelement <4 x i64> undef, i64 %b, i32 0
@@ -756,11 +826,25 @@
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: mul_op1_constant_v8i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    imull $42, (%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: mul_op1_constant_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    imull $42, (%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: mul_op1_constant_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [42,42,42,42,42,42,42,42]
+; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: mul_op1_constant_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [42,42,42,42,42,42,42,42]
+; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
   %x = load i32, i32* %p
   %b = mul i32 %x, 42
   %r = insertelement <8 x i32> undef, i32 %b, i32 0
@@ -775,12 +859,25 @@
 ; SSE-NEXT:    movq %rax, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: or_op1_constant_v4i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movq (%rdi), %rax
-; AVX-NEXT:    orq $42, %rax
-; AVX-NEXT:    vmovq %rax, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: or_op1_constant_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: or_op1_constant_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [42,42,42,42]
+; AVX2-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: or_op1_constant_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [42,42,42,42]
+; AVX512-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
   %x = load i64, i64* %p
   %b = or i64 %x, 42
   %r = insertelement <4 x i64> undef, i64 %b, i32 0
@@ -797,12 +894,25 @@
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: add_op1_constant_v16i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movl (%rdi), %eax
-; AVX-NEXT:    addl $42, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: add_op1_constant_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movl (%rdi), %eax
+; AVX1-NEXT:    addl $42, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: add_op1_constant_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movl (%rdi), %eax
+; AVX2-NEXT:    addl $42, %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: add_op1_constant_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512-NEXT:    retq
   %x = load i32, i32* %p
   %b = add i32 %x, 42
   %r = insertelement <16 x i32> undef, i32 %b, i32 0
@@ -817,12 +927,26 @@
 ; SSE-NEXT:    movq %rax, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: sub_op0_constant_v8i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movl $42, %eax
-; AVX-NEXT:    subq (%rdi), %rax
-; AVX-NEXT:    vmovq %rax, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: sub_op0_constant_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movl $42, %eax
+; AVX1-NEXT:    subq (%rdi), %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sub_op0_constant_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movl $42, %eax
+; AVX2-NEXT:    subq (%rdi), %rax
+; AVX2-NEXT:    vmovq %rax, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sub_op0_constant_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [42,42,42,42,42,42,42,42]
+; AVX512-NEXT:    vpsubq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    retq
   %x = load i64, i64* %p
   %b = sub i64 42, %x
   %r = insertelement <8 x i64> undef, i64 %b, i32 0
@@ -836,11 +960,23 @@
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: mul_op1_constant_v16i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    imull $42, (%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: mul_op1_constant_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    imull $42, (%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: mul_op1_constant_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    imull $42, (%rdi), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: mul_op1_constant_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512-NEXT:    retq
   %x = load i32, i32* %p
   %b = mul i32 %x, 42
   %r = insertelement <16 x i32> undef, i32 %b, i32 0
@@ -855,12 +991,25 @@
 ; SSE-NEXT:    movq %rax, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: or_op1_constant_v8i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movq (%rdi), %rax
-; AVX-NEXT:    orq $42, %rax
-; AVX-NEXT:    vmovq %rax, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: or_op1_constant_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movq (%rdi), %rax
+; AVX1-NEXT:    orq $42, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: or_op1_constant_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq (%rdi), %rax
+; AVX2-NEXT:    orq $42, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: or_op1_constant_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT:    vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT:    retq
   %x = load i64, i64* %p
   %b = or i64 %x, 42
   %r = insertelement <8 x i64> undef, i64 %b, i32 0