Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -6007,7 +6007,10 @@
 /// or SDValue() otherwise.
 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
                                     SelectionDAG &DAG) {
-  if (!Subtarget->hasFp256())
+  // VBROADCAST requires AVX.
+  // TODO: It's possible to optimize splats for non-AVX CPUs using SSE
+  // instructions, but there's less potential gain for only 128-bit vectors.
+  if (!Subtarget->hasAVX())
     return SDValue();

   MVT VT = Op.getSimpleValueType();
@@ -6084,25 +6087,57 @@
     }
   }

-  bool IsGE256 = (VT.getSizeInBits() >= 256);
+  unsigned VecSize = VT.getSizeInBits();
+  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
+  bool IsGE256 = (VecSize >= 256);
+
+  // When optimizing for size, generate up to 5 extra bytes for a broadcast
+  // instruction to save 8 or more bytes of constant pool data.
+  const Function *F = DAG.getMachineFunction().getFunction();
+  bool OptForSize = F->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);

-  // Handle the broadcasting a single constant scalar from the constant pool
-  // into a vector. On Sandybridge it is still better to load a constant vector
+  // Handle broadcasting a single constant scalar from the constant pool
+  // into a vector.
+  // On Sandybridge (no AVX2), it is still better to load a constant vector
   // from the constant pool and not to broadcast it from a scalar.
-  if (ConstSplatVal && Subtarget->hasInt256()) {
+  // But override that restriction when optimizing for size.
+  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
+  if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
     EVT CVT = Ld.getValueType();
     assert(!CVT.isVector() && "Must not broadcast a vector type");
-    unsigned ScalarSize = CVT.getSizeInBits();
-
-    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
-      const Constant *C = nullptr;
+    unsigned Opcode = X86ISD::VBROADCAST; // This only changes for v2[f|i]64.
+    const Constant *C = nullptr;
+
+    // The v2[f/i]64 case is a mess because there is no VBROADCAST to handle it.
+    // Deal with it before handling all other vector types.
+    if (VecSize == 128 && ScalarSize == 64) {
+      // This is only a size optimization - could be slightly slower in time.
+      if (OptForSize) {
+        if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) {
+          C = CI->getConstantIntValue();
+          if (!Subtarget->hasAVX2()) {
+            // For an AVX CPU, fake an int splat with FP splat.
+            Opcode = X86ISD::MOVDDUP;
+            CVT = MVT::v2f64;
+            VT = MVT::v2f64;
+          }
+        } else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) {
+          C = CF->getConstantFPValue();
+          Opcode = X86ISD::MOVDDUP;
+        }
+      }
+    } else if (ScalarSize >= 32 || (OptForSize && Subtarget->hasAVX2())) {
+      // Splat f32, f64, i32, i64 (excluding v2f64 and v2i64 handled above).
+      // For size optimization with AVX2, also splat i8 and i16.
+      // For an AVX CPU, fake i32 splats using the FP splat instruction.
       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
         C = CI->getConstantIntValue();
       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
         C = CF->getConstantFPValue();
+    }

-      assert(C && "Invalid constant type");
-
+    if (C) {
       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
@@ -6110,12 +6145,11 @@
                        MachinePointerInfo::getConstantPool(),
                        false, false, false, Alignment);

-      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+      return DAG.getNode(Opcode, dl, VT, Ld);
     }
   }

   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
-  unsigned ScalarSize = Ld.getValueType().getSizeInBits();

   // Handle AVX2 in-register broadcasts.
   if (!IsLoad && Subtarget->hasInt256() &&
Index: test/CodeGen/X86/splat-for-size.ll
===================================================================
--- test/CodeGen/X86/splat-for-size.ll
+++ test/CodeGen/X86/splat-for-size.ll
@@ -0,0 +1,141 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s -check-prefix=CHECK --check-prefix=AVX
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx2 < %s | FileCheck %s -check-prefix=CHECK --check-prefix=AVX2
+
+; Check constant loads of every 128-bit and 256-bit vector type
+; for size optimization using splat ops available with AVX and AVX2.
+
+; There is no AVX broadcast from double to 128-bit vector because movddup has been around since SSE3 (grrr).
+define <2 x double> @splat_v2f64(<2 x double> %x) #0 {
+  %add = fadd <2 x double> %x, <double 1.0, double 1.0>
+  ret <2 x double> %add
+; CHECK-LABEL: splat_v2f64
+; CHECK: vmovddup
+; CHECK: vaddpd
+; CHECK-NEXT: retq
+}
+
+define <4 x double> @splat_v4f64(<4 x double> %x) #0 {
+  %add = fadd <4 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0>
+  ret <4 x double> %add
+; CHECK-LABEL: splat_v4f64
+; CHECK: vbroadcastsd
+; CHECK-NEXT: vaddpd
+; CHECK-NEXT: retq
+}
+
+define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
+  %add = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+  ret <4 x float> %add
+; CHECK-LABEL: splat_v4f32
+; CHECK: vbroadcastss
+; CHECK-NEXT: vaddps
+; CHECK-NEXT: retq
+}
+
+define <8 x float> @splat_v8f32(<8 x float> %x) #0 {
+  %add = fadd <8 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
+  ret <8 x float> %add
+; CHECK-LABEL: splat_v8f32
+; CHECK: vbroadcastss
+; CHECK-NEXT: vaddps
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, so fake it: use vmovddup to splat 64-bit value.
+define <2 x i64> @splat_v2i64(<2 x i64> %x) #0 {
+  %add = add <2 x i64> %x, <i64 1, i64 1>
+  ret <2 x i64> %add
+; CHECK-LABEL: splat_v2i64
+; AVX: vmovddup
+; AVX2: vpbroadcastq
+; CHECK: vpaddq
+; CHECK-NEXT: retq
+}
+
+; AVX can't do 256-bit integer ops, so we split this into two 128-bit vectors,
+; and then we fake it: use vmovddup to splat 64-bit value.
+define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
+  %add = add <4 x i64> %x, <i64 1, i64 1, i64 1, i64 1>
+  ret <4 x i64> %add
+; CHECK-LABEL: splat_v4i64
+; AVX: vmovddup
+; AVX: vpaddq
+; AVX: vpaddq
+; AVX2: vpbroadcastq
+; AVX2: vpaddq
+; CHECK: retq
+}
+
+; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value.
+define <4 x i32> @splat_v4i32(<4 x i32> %x) #0 {
+  %add = add <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %add
+; CHECK-LABEL: splat_v4i32
+; AVX: vbroadcastss
+; AVX2: vpbroadcastd
+; CHECK-NEXT: vpaddd
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value.
+define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 {
+  %add = add <8 x i32> %x, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i32> %add
+; CHECK-LABEL: splat_v8i32
+; AVX: vbroadcastss
+; AVX: vpaddd
+; AVX: vpaddd
+; AVX2: vpbroadcastd
+; AVX2: vpaddd
+; CHECK: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
+define <8 x i16> @splat_v8i16(<8 x i16> %x) #0 {
+  %add = add <8 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %add
+; CHECK-LABEL: splat_v8i16
+; AVX-NOT: broadcast
+; AVX2: vpbroadcastw
+; CHECK: vpaddw
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
+define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 {
+  %add = add <16 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <16 x i16> %add
+; CHECK-LABEL: splat_v16i16
+; AVX-NOT: broadcast
+; AVX: vpaddw
+; AVX: vpaddw
+; AVX2: vpbroadcastw
+; AVX2: vpaddw
+; CHECK: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc?
+define <16 x i8> @splat_v16i8(<16 x i8> %x) #0 {
+  %add = add <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %add
+; CHECK-LABEL: splat_v16i8
+; AVX-NOT: broadcast
+; AVX2: vpbroadcastb
+; CHECK: vpaddb
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc?
+define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 {
+  %add = add <32 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <32 x i8> %add
+; CHECK-LABEL: splat_v32i8
+; AVX-NOT: broadcast
+; AVX: vpaddb
+; AVX: vpaddb
+; AVX2: vpbroadcastb
+; AVX2: vpaddb
+; CHECK: retq
+}
+
+attributes #0 = { optsize }
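
For reference, here is a minimal companion sketch (not part of the patch) showing the other side of the size heuristic: without the optsize attribute, an AVX-only (no AVX2) target is still expected to load the whole splat vector from the constant pool, typically folded into the arithmetic instruction, rather than splatting the scalar with vmovddup. The function name, RUN line, and constant value below are placeholders for illustration, not taken from the original test.

; Assumed RUN line: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s
; With no optsize attribute, the <2 x double> splat constant should still be
; materialized as a full vector load from the constant pool on AVX, e.g.
; folded into the add as something like: vaddpd {{.*}}(%rip), %xmm0, %xmm0
define <2 x double> @splat_v2f64_no_optsize(<2 x double> %x) {
  %add = fadd <2 x double> %x, <double 1.0, double 1.0>
  ret <2 x double> %add
}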