Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
@@ -33,6 +34,7 @@
 using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "vector-combine"
+STATISTIC(NumVecLoad, "Number of vector loads formed");
 STATISTIC(NumVecCmp, "Number of vector compares formed");
 STATISTIC(NumVecBO, "Number of vector binops formed");
 STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
@@ -65,6 +67,7 @@
   const TargetTransformInfo &TTI;
   const DominatorTree &DT;
 
+  bool vectorizeLoadInsert(Instruction &I);
   ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
                                         ExtractElementInst *Ext1,
                                         unsigned PreferredExtractIndex) const;
@@ -88,6 +91,61 @@
   New.takeName(&Old);
 }
 
+bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
+  // Match insert of scalar load.
+  Value *Scalar;
+  if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
+    return false;
+  auto *Load = dyn_cast<LoadInst>(Scalar);
+  Type *ScalarTy = Scalar->getType();
+  if (!Load || !Load->isSimple())
+    return false;
+
+  // TODO: Extend this to match GEP with constant offsets.
+  Value *PtrOp = Load->getPointerOperand()->stripPointerCasts();
+  auto *PtrOpTy = dyn_cast<PointerType>(PtrOp->getType());
+  if (!PtrOpTy)
+    return false;
+
+  unsigned VectorSize = TTI.getMinVectorRegisterBitWidth();
+  uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
+  if (!ScalarSize || VectorSize % ScalarSize != 0)
+    return false;
+
+  // Check safety of replacing the scalar load with a larger vector load.
+  auto *VectorTy = VectorType::get(ScalarTy, VectorSize / ScalarSize, false);
+  // TODO: Allow insert/extract subvector if the type does not match.
+  if (VectorTy != I.getType())
+    return false;
+  Align Alignment = Load->getAlign();
+  const DataLayout &DL = I.getModule()->getDataLayout();
+  if (!isSafeToLoadUnconditionally(PtrOp, VectorTy, Alignment, DL, Load, &DT))
+    return false;
+
+  // Original pattern: insertelt undef, load [free casts of] ScalarPtr, 0
+  int OldCost = TTI.getMemoryOpCost(Instruction::Load, ScalarTy, Alignment,
+                                    Load->getPointerAddressSpace());
+  OldCost += TTI.getVectorInstrCost(Instruction::InsertElement, I.getType());
+
+  // New pattern: load VecPtr
+  int NewCost = TTI.getMemoryOpCost(Instruction::Load, VectorTy, Alignment,
+                                    Load->getPointerAddressSpace());
+
+  // We can aggressively convert to the vector form because the backend can
+  // invert this transform if it does not result in a performance win.
+  if (OldCost < NewCost)
+    return false;
+
+  // It is safe and potentially profitable to load a vector directly:
+  // inselt undef, load Scalar, 0 --> load VecPtr
+  IRBuilder<> Builder(Load);
+  Value *CastedPtr = Builder.CreateBitCast(PtrOp, VectorTy->getPointerTo());
+  LoadInst *VecLd = Builder.CreateAlignedLoad(VectorTy, CastedPtr, Alignment);
+  replaceValue(I, *VecLd);
+  ++NumVecLoad;
+  return true;
+}
+
 /// Determine which, if any, of the inputs should be replaced by a shuffle
 /// followed by extract from a different index.
 ExtractElementInst *VectorCombine::getShuffleExtract(
@@ -625,6 +683,7 @@
       if (isa<DbgInfoIntrinsic>(I))
         continue;
       Builder.SetInsertPoint(&I);
+      MadeChange |= vectorizeLoadInsert(I);
       MadeChange |= foldExtractExtract(I);
       MadeChange |= foldBitcastShuf(I);
       MadeChange |= scalarizeBinopOrCmp(I);
Index: llvm/test/Transforms/VectorCombine/X86/load.ll
===================================================================
--- llvm/test/Transforms/VectorCombine/X86/load.ll
+++ llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -174,8 +174,8 @@
 
 define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v4f32(
-; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %s = load float, float* %p, align 4
@@ -185,9 +185,7 @@
 
 define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @casted_load_f32_insert_v4f32(
-; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x float>* [[P:%.*]] to float*
-; CHECK-NEXT:    [[S:%.*]] = load float, float* [[B]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
+; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 4
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %b = bitcast <4 x float>* %p to float*
@@ -196,10 +194,12 @@
   ret <4 x float> %r
 }
 
+; Element type does not change cost.
+
 define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_i32_insert_v4i32(
-; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[R:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %s = load i32, i32* %p, align 4
@@ -207,11 +207,12 @@
   ret <4 x i32> %r
 }
 
+; Pointer type does not change cost.
+
 define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @casted_load_i32_insert_v4i32(
-; CHECK-NEXT:    [[B:%.*]] = bitcast <16 x i8>* [[P:%.*]] to i32*
-; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[B]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[R:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %b = bitcast <16 x i8>* %p to i32*
@@ -220,11 +221,11 @@
   ret <4 x i32> %r
 }
 
+; This is canonical form for vector element access.
+
 define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @gep00_load_f32_insert_v4f32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 0
-; CHECK-NEXT:    [[S:%.*]] = load float, float* [[GEP]], align 16
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i64 0
+; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0
@@ -233,11 +234,13 @@
   ret <4 x float> %r
 }
 
+; If there are enough dereferenceable bytes, we can offset the vector load.
+
 define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) {
 ; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 2
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
+; CHECK-NEXT:    [[R:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
@@ -246,6 +249,8 @@
   ret <8 x i16> %r
 }
 
+; Negative test - can't safely load the offset vector, but could load+shuffle.
+
 define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(17) %p) {
 ; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
@@ -259,11 +264,13 @@
   ret <8 x i16> %r
 }
 
+; If there are enough dereferenceable bytes, we can offset the vector load.
+
 define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(32) %p) {
 ; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
-; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
+; CHECK-NEXT:    [[R:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
@@ -272,6 +279,8 @@
   ret <8 x i16> %r
 }
 
+; Negative test - can't safely load the offset vector, but could load+shuffle.
+
 define <8 x i16> @gep10_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(31) %p) {
 ; CHECK-LABEL: @gep10_load_i16_insert_v8i16_deref(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
@@ -285,6 +294,8 @@
   ret <8 x i16> %r
 }
 
+; Negative test - do not alter volatile.
+
 define <4 x float> @load_f32_insert_v4f32_volatile(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v4f32_volatile(
 ; CHECK-NEXT:    [[S:%.*]] = load volatile float, float* [[P:%.*]], align 4
@@ -296,6 +307,8 @@
   ret <4 x float> %r
 }
 
+; Negative test? - pointer is not as aligned as load.
+
 define <4 x float> @load_f32_insert_v4f32_align(float* align 1 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v4f32_align(
 ; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
@@ -307,6 +320,8 @@
   ret <4 x float> %r
 }
 
+; Negative test - not enough bytes.
+
 define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(15) %p) {
 ; CHECK-LABEL: @load_f32_insert_v4f32_deref(
 ; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
@@ -318,6 +333,8 @@
   ret <4 x float> %r
 }
 
+; TODO: Should load v4i32.
+
 define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_i32_insert_v8i32(
 ; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[P:%.*]], align 4
@@ -329,6 +346,8 @@
   ret <8 x i32> %r
 }
 
+; TODO: Should load v4i32.
+
 define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @casted_load_i32_insert_v8i32(
 ; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x i32>* [[P:%.*]] to i32*
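
Note (not part of the patch): a minimal standalone sketch for trying the new fold locally. It assumes an opt invocation along the lines of "opt -vector-combine -S -mtriple=x86_64--"; the test file's actual RUN line is not shown in the hunks above. With the patch applied, the scalar load + insertelement pair below should collapse into a single <4 x float> load, mirroring @load_f32_insert_v4f32.

; repro.ll - hypothetical standalone example, not taken from the patch
define <4 x float> @repro(float* align 16 dereferenceable(16) %p) {
  ; Scalar load of the first element...
  %s = load float, float* %p, align 4
  ; ...inserted into lane 0 of an undef vector; vectorizeLoadInsert should
  ; rewrite this pair as a bitcast of %p to <4 x float>* plus a vector load.
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}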