Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
@@ -33,6 +34,7 @@
 using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "vector-combine"
+STATISTIC(NumVecLoad, "Number of vector loads formed");
 STATISTIC(NumVecCmp, "Number of vector compares formed");
 STATISTIC(NumVecBO, "Number of vector binops formed");
 STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
@@ -65,6 +67,7 @@
   const TargetTransformInfo &TTI;
   const DominatorTree &DT;
 
+  bool vectorizeLoad(Instruction &I);
   ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
                                         ExtractElementInst *Ext1,
                                         unsigned PreferredExtractIndex) const;
@@ -88,6 +91,58 @@
   New.takeName(&Old);
 }
 
+bool VectorCombine::vectorizeLoad(Instruction &I) {
+  // Match regular scalar loads.
+  auto *Load = dyn_cast<LoadInst>(&I);
+  Type *LoadTy = I.getType();
+  if (!Load || !Load->isSimple() || isa<VectorType>(LoadTy))
+    return false;
+
+  // TODO: Extend this to match GEP with constant offsets.
+  Value *PtrOp = Load->getPointerOperand()->stripPointerCasts();
+  auto *PtrOpTy = dyn_cast<PointerType>(PtrOp->getType());
+  if (!PtrOpTy)
+    return false;
+
+  unsigned VectorSize = TTI.getMinVectorRegisterBitWidth();
+  uint64_t ScalarSize = LoadTy->getPrimitiveSizeInBits();
+  if (!ScalarSize || VectorSize % ScalarSize != 0)
+    return false;
+
+
+  // Check safety of replacing the scalar load with a larger vector load.
+  auto *VectorTy = VectorType::get(LoadTy, VectorSize / ScalarSize, false);
+  Align Alignment = Load->getAlign();
+  const DataLayout &DL = I.getModule()->getDataLayout();
+  if (!isSafeToLoadUnconditionally(PtrOp, VectorTy, Alignment, DL, Load, &DT))
+    return false;
+
+  // Original pattern: load [free casts of] ScalarPtr
+  int OldCost = TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+                                    Load->getPointerAddressSpace());
+
+  // New pattern: extractelt (load VecPtr), 0
+  int NewCost = TTI.getMemoryOpCost(Instruction::Load, VectorTy, Alignment,
+                                    Load->getPointerAddressSpace());
+  NewCost += TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 0);
+
+  // We can aggressively convert to the vector form because the backend will
+  // invert this transform if it does not result in a performance win.
+  if (OldCost < NewCost)
+    return false;
+
+  // It is safe and potentially profitable to load a vector and extract the
+  // scalar value from that:
+  // load (bitcast VecPtr to ScalarPtr) --> extractelt (load VecPtr), 0
+  IRBuilder<> Builder(Load);
+  Value *CastedPtr = Builder.CreateBitCast(PtrOp, VectorTy->getPointerTo());
+  LoadInst *VecLd = Builder.CreateAlignedLoad(VectorTy, CastedPtr, Alignment);
+  Value *ExtElt = Builder.CreateExtractElement(VecLd, Builder.getInt32(0));
+  replaceValue(I, *ExtElt);
+  ++NumVecLoad;
+  return true;
+}
+
 /// Determine which, if any, of the inputs should be replaced by a shuffle
 /// followed by extract from a different index.
 ExtractElementInst *VectorCombine::getShuffleExtract(
@@ -625,6 +680,7 @@
       if (isa<DbgInfoIntrinsic>(I))
         continue;
       Builder.SetInsertPoint(&I);
+      MadeChange |= vectorizeLoad(I);
       MadeChange |= foldExtractExtract(I);
       MadeChange |= foldBitcastShuf(I);
       MadeChange |= scalarizeBinopOrCmp(I);
Index: llvm/test/Transforms/VectorCombine/X86/load.ll
===================================================================
--- llvm/test/Transforms/VectorCombine/X86/load.ll
+++ llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -6,13 +6,17 @@
 
 define float @matching_fp_scalar(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @matching_fp_scalar(
-; CHECK-NEXT:    [[R:%.*]] = load float, float* [[P:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %r = load float, float* %p, align 16
   ret float %r
 }
 
+; Negative test - do not alter volatile.
+
 define float @matching_fp_scalar_volatile(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @matching_fp_scalar_volatile(
 ; CHECK-NEXT:    [[R:%.*]] = load volatile float, float* [[P:%.*]], align 16
@@ -24,8 +28,9 @@
 
 define double @larger_fp_scalar(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @larger_fp_scalar(
-; CHECK-NEXT:    [[BC:%.*]] = bitcast float* [[P:%.*]] to double*
-; CHECK-NEXT:    [[R:%.*]] = load double, double* [[BC]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
 ; CHECK-NEXT:    ret double [[R]]
 ;
   %bc = bitcast float* %p to double*
@@ -35,8 +40,9 @@
 
 define float @smaller_fp_scalar(double* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @smaller_fp_scalar(
-; CHECK-NEXT:    [[BC:%.*]] = bitcast double* [[P:%.*]] to float*
-; CHECK-NEXT:    [[R:%.*]] = load float, float* [[BC]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %bc = bitcast double* %p to float*
@@ -46,8 +52,8 @@
 
 define float @matching_fp_vector(<4 x float>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @matching_fp_vector(
-; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float>* [[P:%.*]] to float*
-; CHECK-NEXT:    [[R:%.*]] = load float, float* [[BC]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %bc = bitcast <4 x float>* %p to float*
@@ -55,10 +61,12 @@
   ret float %r
 }
 
+; This is canonical form for load of vector element 0.
+
 define float @matching_fp_vector_gep0(<4 x float>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @matching_fp_vector_gep0(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 0
-; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0
@@ -66,10 +74,13 @@
   ret float %r
 }
 
+; Pointer types do not affect the transform - only the loaded type is considered for extract cost.
+
 define float @nonmatching_int_vector(<2 x i64>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @nonmatching_int_vector(
-; CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64>* [[P:%.*]] to float*
-; CHECK-NEXT:    [[R:%.*]] = load float, float* [[BC]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %bc = bitcast <2 x i64>* %p to float*
@@ -79,13 +90,17 @@
 
 define double @less_aligned(double* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @less_aligned(
-; CHECK-NEXT:    [[R:%.*]] = load double, double* [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
+; CHECK-NEXT:    [[R:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
 ; CHECK-NEXT:    ret double [[R]]
 ;
   %r = load double, double* %p, align 4
   ret double %r
 }
 
+; Negative test - not enough dereferenceable bytes.
+
 define float @matching_fp_scalar_small_deref(float* align 16 dereferenceable(15) %p) {
 ; CHECK-LABEL: @matching_fp_scalar_small_deref(
 ; CHECK-NEXT:    [[R:%.*]] = load float, float* [[P:%.*]], align 16
@@ -95,6 +110,8 @@
   ret float %r
 }
 
+; Negative test - extract from vector to int is not free.
+
 define i64 @larger_int_scalar(<4 x float>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @larger_int_scalar(
 ; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float>* [[P:%.*]] to i64*
@@ -106,6 +123,8 @@
   ret i64 %r
 }
 
+; Negative test - extract from vector to int is not free.
+
 define i8 @smaller_int_scalar(<4 x float>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @smaller_int_scalar(
 ; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float>* [[P:%.*]] to i8*
@@ -117,10 +136,13 @@
   ret i8 %r
 }
 
+; We could load a 32-byte vector, but we favor the minimum vector size.
+
 define double @larger_fp_scalar_256bit_vec(<8 x float>* align 32 dereferenceable(32) %p) {
 ; CHECK-LABEL: @larger_fp_scalar_256bit_vec(
-; CHECK-NEXT:    [[BC:%.*]] = bitcast <8 x float>* [[P:%.*]] to double*
-; CHECK-NEXT:    [[R:%.*]] = load double, double* [[BC]], align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x float>* [[P:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 32
+; CHECK-NEXT:    [[R:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
 ; CHECK-NEXT:    ret double [[R]]
 ;
   %bc = bitcast <8 x float>* %p to double*
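
A minimal before/after sketch of the rewrite that vectorizeLoad() performs, assuming a target whose minimum vector register width is 128 bits (as in the X86 tests above). The function name @sketch and the value names are illustrative only and not part of the patch; the pass is driven by the test file's RUN line (for example via -passes=vector-combine).

; Input: a simple scalar load from a pointer known to be sufficiently
; aligned and dereferenceable for a full vector access.
define float @sketch(float* align 16 dereferenceable(16) %p) {
  %r = load float, float* %p, align 16
  ret float %r
}

; Expected output: the pointer is bitcast to a vector pointer, a 128-bit
; vector is loaded, and lane 0 is extracted as the original scalar value.
define float @sketch(float* align 16 dereferenceable(16) %p) {
  %vp = bitcast float* %p to <4 x float>*
  %v = load <4 x float>, <4 x float>* %vp, align 16
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

The conversion is gated on the TTI cost check in the patch: the <4 x float> load plus the lane-0 extract must not be modeled as more expensive than the original scalar load, and the backend is expected to invert the transform where it does not pay off.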