Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -533,6 +533,17 @@
     return false;
   }
 
+  /// Given an insert-element vector (InsElt) and a scalar operation (Op),
+  /// return true if it would be profitable to convert the scalar operation
+  /// into a vector operation. This would normally be true if:
+  /// 1. The vector operation does not cost much more than a scalar version.
+  /// 2. The target can avoid a costly transfer from scalar to vector register
+  ///    by loading a scalar operand directly into a vector register.
+  virtual bool shouldLoadScalarIntoVectorOp(SDValue InsElt, SDValue Op,
+                                            SelectionDAG &DAG) const {
+    return false;
+  }
+
   /// Return true if the target wants to use the optimization that
   /// turns ext(promotableInst1(...(promotableInstN(load)))) into
   /// promotedInst1(...(promotedInstN(ext(load)))).
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15168,6 +15168,65 @@
   return DAG.getBitcast(VT, Shuf);
 }
 
+/// Try to convert a scalar load + scalar binop + insert element sequence into
+/// a scalar load + insert element + vector binop. If we can load the scalar
+/// directly into a vector register, this eliminates a potentially expensive
+/// transfer from a scalar register to a vector register.
+static SDValue combineLoadBinopInsElt(SDNode *Ins, SelectionDAG &DAG) {
+  // TODO: This can be loosened to allow insertion into any constant vector.
+  SDValue UndefVec = Ins->getOperand(0);
+  SDValue BO = Ins->getOperand(1);
+  if (!UndefVec.isUndef() || !ISD::isBinaryOp(BO.getNode()) || !BO.hasOneUse())
+    return SDValue();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.shouldLoadScalarIntoVectorOp(SDValue(Ins, 0), BO, DAG))
+    return SDValue();
+
+  // We are matching a binop that has a loaded operand and a constant operand.
+  // This is complicated because the operands can be in either order (and we
+  // must capture that fact), and the constant can be either integer or FP.
+  EVT VecVT = Ins->getOperand(0).getValueType();
+  SDValue Ld, C;
+  SDLoc DL(Ins);
+  auto matchLoadAndConstant = [&](SDValue Op0, SDValue Op1) {
+    if (Op0.getOpcode() != ISD::LOAD)
+      return false;
+    // Splat a scalar constant operand for use in a vector op. The caller can
+    // adjust (blend) this constant with the original insertion vector constant.
+    if (auto *CInt = dyn_cast<ConstantSDNode>(Op1)) {
+      // Account for scalar operand size differences. For example, a scalar
+      // shift amount may have a different type than the other operand/result.
+      unsigned Width = VecVT.getScalarSizeInBits();
+      C = DAG.getConstant(CInt->getAPIntValue().zextOrTrunc(Width), DL, VecVT);
+    } else if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op1)) {
+      C = DAG.getConstantFP(CFP->getValueAPF(), DL, VecVT);
+    } else {
+      return false;
+    }
+    Ld = Op0;
+    return true;
+  };
+
+  SDValue BO0 = BO.getOperand(0), BO1 = BO.getOperand(1);
+  bool Op0IsLoad = matchLoadAndConstant(BO0, BO1);
+  if (!Op0IsLoad && !matchLoadAndConstant(BO1, BO0))
+    return SDValue();
+
+  // ins undef, (bo (load X), C), index --> bo (ins undef, (load X), index), C'
+  // ins undef, (bo C, (load X)), index --> bo C', (ins undef, (load X), index)
+  SDValue NewInsert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
+                                  UndefVec, Op0IsLoad ? BO0 : BO1,
+                                  Ins->getOperand(2));
+
+  // TODO: We created a splat constant above because we did not check the
+  // insert index. If the insert index is a constant and/or we were not
+  // originally inserting into an undef constant, we should compute the
+  // other constant elements as needed.
+  return Op0IsLoad ? DAG.getNode(BO.getOpcode(), DL, VecVT, NewInsert, C)
+                   : DAG.getNode(BO.getOpcode(), DL, VecVT, C, NewInsert);
+}
+
 SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   SDValue InVec = N->getOperand(0);
   SDValue InVal = N->getOperand(1);
@@ -15197,6 +15256,9 @@
     return SDValue();
   }
 
+  if (SDValue BO = combineLoadBinopInsElt(N, DAG))
+    return BO;
+
   // We must know which element is being inserted for folds below here.
   unsigned Elt = IndexC->getZExtValue();
   if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -835,6 +835,9 @@
 
   bool shouldSplatInsEltVarIndex(EVT VT) const override;
 
+  bool shouldLoadScalarIntoVectorOp(SDValue Ins, SDValue Op,
+                                    SelectionDAG &DAG) const override;
+
   bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
     return VT.isScalarInteger();
   }
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -4852,6 +4852,39 @@
   return isTypeLegal(VT);
 }
 
+/// If we can load an integer scalar into a vector register with minimal
+/// shuffling and the vector operation is supported, then avoiding a transfer
+/// from GPR to vector is probably a win.
+bool X86TargetLowering::shouldLoadScalarIntoVectorOp(SDValue InsElt,
+                                                     SDValue BinOp,
+                                                     SelectionDAG &DAG) const {
+  // Without SSE2, we only have movss and no integer vector ops; don't bother.
+  // Scalar FP already uses XMM registers, so this would never help an FP op.
+  EVT VecVT = InsElt.getValueType();
+  if (!Subtarget.hasSSE2() || VecVT.isFloatingPoint())
+    return false;
+
+  // Don't try this when optimizing for size because vector code and vector
+  // constants are probably bigger than their scalar counterparts.
+  if (DAG.getMachineFunction().getFunction().optForSize())
+    return false;
+
+  // Loading into the 0-index lane is possible with SSE2 using movd/movq.
+  // TODO: AVX1 and AVX2 can splat (broadcast) various scalar types.
+  EVT ScalarVT = BinOp.getValueType();
+  SDValue InsIdx = InsElt.getOperand(2);
+  if ((ScalarVT != MVT::i32 && ScalarVT != MVT::i64) || !isNullConstant(InsIdx))
+    return false;
+
+  // Filter out illegal vector types, ISA holes, and unsupported vector ops.
+  // TODO: This eliminates custom/promoted ops that are probably ok.
+  auto Opcode = BinOp.getOpcode();
+  if (!isTypeDesirableForOp(Opcode, VecVT) || !isOperationLegal(Opcode, VecVT))
+    return false;
+
+  return true;
+}
+
 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
   MVT VT = MVT::getIntegerVT(NumBits);
   if (isTypeLegal(VT))
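
(Reviewer note, not part of the patch.) The index check above means the X86 hook only fires when the scalar is inserted into lane 0, where SSE2 can load it directly with movd/movq. A hypothetical reproducer like the following (function name invented; it does not necessarily correspond to anything in the test file) inserts at lane 1, so shouldLoadScalarIntoVectorOp() is expected to return false and the existing scalar add + movd sequence should remain:

; Hypothetical example: the insert index is 1, so isNullConstant(InsIdx) fails
; in the X86 hook and no vector binop is formed.
define <4 x i32> @add_op1_constant_lane1(i32* %p) nounwind {
  %x = load i32, i32* %p
  %b = add i32 %x, 42
  %r = insertelement <4 x i32> undef, i32 %b, i32 1
  ret <4 x i32> %r
}
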
Index: test/CodeGen/X86/load-scalar-as-vector.ll
===================================================================
--- test/CodeGen/X86/load-scalar-as-vector.ll
+++ test/CodeGen/X86/load-scalar-as-vector.ll
@@ -8,17 +8,29 @@
 define <4 x i32> @add_op1_constant(i32* %p) nounwind {
 ; SSE-LABEL: add_op1_constant:
 ; SSE: # %bb.0:
-; SSE-NEXT: movl (%rdi), %eax
-; SSE-NEXT: addl $42, %eax
-; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: add_op1_constant:
-; AVX: # %bb.0:
-; AVX-NEXT: movl (%rdi), %eax
-; AVX-NEXT: addl $42, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: add_op1_constant:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: add_op1_constant:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42,42,42,42]
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: add_op1_constant:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42,42,42,42]
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
   %x = load i32, i32* %p
   %b = add i32 %x, 42
   %r = insertelement <4 x i32> undef, i32 %b, i32 0
@@ -70,16 +82,16 @@
 define <2 x i64> @sub_op0_constant(i64* %p) nounwind {
 ; SSE-LABEL: sub_op0_constant:
 ; SSE: # %bb.0:
-; SSE-NEXT: movl $42, %eax
-; SSE-NEXT: subq (%rdi), %rax
-; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movdqa {{.*#+}} xmm0 = [42,42]
+; SSE-NEXT: psubq %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: sub_op0_constant:
 ; AVX: # %bb.0:
-; AVX-NEXT: movl $42, %eax
-; AVX-NEXT: subq (%rdi), %rax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42]
+; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: retq
   %x = load i64, i64* %p
   %b = sub i64 42, %x
@@ -110,17 +122,37 @@
 }
 
 define <4 x i32> @mul_op1_constant(i32* %p) nounwind {
-; SSE-LABEL: mul_op1_constant:
-; SSE: # %bb.0:
-; SSE-NEXT: imull $42, (%rdi), %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: mul_op1_constant:
-; AVX: # %bb.0:
-; AVX-NEXT: imull $42, (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: retq
+; SSE2-LABEL: mul_op1_constant:
+; SSE2: # %bb.0:
+; SSE2-NEXT: imull $42, (%rdi), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: mul_op1_constant:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE4-NEXT: pmulld {{.*}}(%rip), %xmm0
+; SSE4-NEXT: retq
+;
+; AVX1-LABEL: mul_op1_constant:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mul_op1_constant:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42,42,42,42]
+; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_op1_constant:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42,42,42,42]
+; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
   %x = load i32, i32* %p
   %b = mul i32 %x, 42
   %r = insertelement <4 x i32> undef, i32 %b, i32 0
@@ -170,16 +202,14 @@
 define <2 x i64> @or_op1_constant(i64* %p) nounwind {
 ; SSE-LABEL: or_op1_constant:
 ; SSE: # %bb.0:
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: orq $42, %rax
-; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: orps {{.*}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: or_op1_constant:
 ; AVX: # %bb.0:
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: orq $42, %rax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT: retq
   %x = load i64, i64* %p
   %b = or i64 %x, 42
@@ -719,12 +749,26 @@
 ; SSE-NEXT: movd %eax, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: add_op1_constant_v8i32:
-; AVX: # %bb.0:
-; AVX-NEXT: movl (%rdi), %eax
-; AVX-NEXT: addl $42, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: add_op1_constant_v8i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movl (%rdi), %eax
+; AVX1-NEXT: addl $42, %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: add_op1_constant_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [42,42,42,42,42,42,42,42]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: add_op1_constant_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [42,42,42,42,42,42,42,42]
+; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
   %x = load i32, i32* %p
   %b = add i32 %x, 42
   %r = insertelement <8 x i32> undef, i32 %b, i32 0
@@ -739,12 +783,26 @@
 ; SSE-NEXT: movq %rax, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: sub_op0_constant_v4i64:
-; AVX: # %bb.0:
-; AVX-NEXT: movl $42, %eax
-; AVX-NEXT: subq (%rdi), %rax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: sub_op0_constant_v4i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movl $42, %eax
+; AVX1-NEXT: subq (%rdi), %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sub_op0_constant_v4i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [42,42,42,42]
+; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: sub_op0_constant_v4i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [42,42,42,42]
+; AVX512-NEXT: vpsubq %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: retq
   %x = load i64, i64* %p
   %b = sub i64 42, %x
   %r = insertelement <4 x i64> undef, i64 %b, i32 0
@@ -758,11 +816,25 @@
 ; SSE-NEXT: movd %eax, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: mul_op1_constant_v8i32:
-; AVX: # %bb.0:
-; AVX-NEXT: imull $42, (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: mul_op1_constant_v8i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: imull $42, (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mul_op1_constant_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [42,42,42,42,42,42,42,42]
+; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_op1_constant_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [42,42,42,42,42,42,42,42]
+; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
   %x = load i32, i32* %p
   %b = mul i32 %x, 42
   %r = insertelement <8 x i32> undef, i32 %b, i32 0
@@ -777,12 +849,25 @@
 ; SSE-NEXT: movq %rax, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: or_op1_constant_v4i64:
-; AVX: # %bb.0:
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: orq $42, %rax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: or_op1_constant_v4i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: or_op1_constant_v4i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [42,42,42,42]
+; AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: or_op1_constant_v4i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm1 = [42,42,42,42]
+; AVX512-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
   %x = load i64, i64* %p
   %b = or i64 %x, 42
   %r = insertelement <4 x i64> undef, i64 %b, i32 0
@@ -799,12 +884,25 @@
 ; SSE-NEXT: movd %eax, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: add_op1_constant_v16i32:
-; AVX: # %bb.0:
-; AVX-NEXT: movl (%rdi), %eax
-; AVX-NEXT: addl $42, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: add_op1_constant_v16i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movl (%rdi), %eax
+; AVX1-NEXT: addl $42, %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: add_op1_constant_v16i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movl (%rdi), %eax
+; AVX2-NEXT: addl $42, %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: add_op1_constant_v16i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512-NEXT: retq
   %x = load i32, i32* %p
   %b = add i32 %x, 42
   %r = insertelement <16 x i32> undef, i32 %b, i32 0
@@ -819,12 +917,26 @@
 ; SSE-NEXT: movq %rax, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: sub_op0_constant_v8i64:
-; AVX: # %bb.0:
-; AVX-NEXT: movl $42, %eax
-; AVX-NEXT: subq (%rdi), %rax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: sub_op0_constant_v8i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movl $42, %eax
+; AVX1-NEXT: subq (%rdi), %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sub_op0_constant_v8i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movl $42, %eax
+; AVX2-NEXT: subq (%rdi), %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: sub_op0_constant_v8i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [42,42,42,42,42,42,42,42]
+; AVX512-NEXT: vpsubq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: retq
   %x = load i64, i64* %p
   %b = sub i64 42, %x
   %r = insertelement <8 x i64> undef, i64 %b, i32 0
@@ -838,11 +950,23 @@
 ; SSE-NEXT: movd %eax, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: mul_op1_constant_v16i32:
-; AVX: # %bb.0:
-; AVX-NEXT: imull $42, (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: mul_op1_constant_v16i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: imull $42, (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mul_op1_constant_v16i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: imull $42, (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_op1_constant_v16i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512-NEXT: retq
   %x = load i32, i32* %p
   %b = mul i32 %x, 42
   %r = insertelement <16 x i32> undef, i32 %b, i32 0
@@ -857,12 +981,25 @@
 ; SSE-NEXT: movq %rax, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: or_op1_constant_v8i64:
-; AVX: # %bb.0:
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: orq $42, %rax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: or_op1_constant_v8i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movq (%rdi), %rax
+; AVX1-NEXT: orq $42, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: or_op1_constant_v8i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: orq $42, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: or_op1_constant_v8i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT: retq
   %x = load i64, i64* %p
   %b = or i64 %x, 42
   %r = insertelement <8 x i64> undef, i64 %b, i32 0
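
(Reviewer note, not part of the patch.) One more case implied by the X86 hook but not exercised by the hunks shown here: the override bails out when the vector type is floating-point, since scalar FP already lives in XMM registers and there is no GPR-to-vector transfer to avoid, so an FP binop feeding the insert is expected to stay in scalar form. A hypothetical sketch (name and constant invented):

; Hypothetical FP example: VecVT.isFloatingPoint() is true, so the X86 hook
; returns false and no vector fadd is formed.
define <4 x float> @fadd_op1_constant(float* %p) nounwind {
  %x = load float, float* %p
  %b = fadd float %x, 42.0
  %r = insertelement <4 x float> undef, float %b, i32 0
  ret <4 x float> %r
}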