Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -93,6 +93,7 @@
 bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // Match insert into fixed vector of scalar value.
+  // TODO: Handle non-zero insert index.
   auto *Ty = dyn_cast<FixedVectorType>(I.getType());
   Value *Scalar;
   if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
@@ -115,7 +116,6 @@
       mustSuppressSpeculation(*Load))
     return false;
 
-  // TODO: Extend this to match GEP with constant offsets.
   const DataLayout &DL = I.getModule()->getDataLayout();
   Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
   assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
@@ -136,9 +136,29 @@
   // Check safety of replacing the scalar load with a larger vector load.
   unsigned MinVecNumElts = MinVectorSize / ScalarSize;
   auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
+  // TODO: also check getPointerAlignment() of SrcPtr?
   Align Alignment = Load->getAlign();
-  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Alignment, DL, Load, &DT))
-    return false;
+  unsigned OffsetInBits = 0;
+  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Alignment, DL, Load,
+                                   &DT)) {
+    // It is not safe to load directly from the pointer, but we can still peek
+    // through gep offsets and check if it is safe to load from a base address
+    // with updated alignment. If it is, we can shuffle the element(s) into
+    // place after loading.
+    unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType());
+    APInt Offset(OffsetBitWidth, 0);
+    SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
+    Alignment = SrcPtr->getPointerAlignment(DL);
+    if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Alignment, DL, Load,
+                                     &DT))
+      return false;
+
+    // The offset must be within a vector length to allow shuffling into place.
+    if (Offset.uge(MinVectorSize / 8))
+      return false;
+
+    OffsetInBits = Offset.getZExtValue() * 8;
+  }
 
   // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
   Type *LoadTy = Load->getType();
@@ -149,6 +169,9 @@
   // New pattern: load VecPtr
   int NewCost = TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
+  // Optionally, we are shuffling the loaded vector element(s) into place.
+  if (OffsetInBits)
+    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy);
 
   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
@@ -164,12 +187,12 @@
   // Set everything but element 0 to undef to prevent poison from propagating
   // from the extra loaded memory. This will also optionally shrink/grow the
   // vector from the loaded size to the output size.
-  // We assume this operation has no cost in codegen.
+  // We assume this operation has no cost in codegen if there was no offset.
   // Note that we could use freeze to avoid poison problems, but then we might
   // still need a shuffle to change the vector size.
   unsigned OutputNumElts = Ty->getNumElements();
   SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
-  Mask[0] = 0;
+  Mask[0] = OffsetInBits / ScalarSize;
   VecLd = Builder.CreateShuffleVector(VecLd, Mask);
 
   replaceValue(I, *VecLd);
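For illustration, a self-contained sketch of the offset arithmetic this patch adds, in plain C++ with a hypothetical helper name (computeShuffleLane is not part of the patch): the constant gep offset, accumulated in bytes, must lie within one minimum-width vector, and it then maps to the shuffle-mask lane that carries the loaded element.

#include <cassert>
#include <optional>

// Mirrors the two added checks above, i.e. "Offset.uge(MinVectorSize / 8)"
// and "Mask[0] = OffsetInBits / ScalarSize".
std::optional<unsigned> computeShuffleLane(unsigned OffsetInBytes,
                                           unsigned MinVectorSizeInBits,
                                           unsigned ScalarSizeInBits) {
  // Reject offsets at or beyond one vector length; such an element cannot
  // be shuffled into lane 0 from a single vector load of the base pointer.
  if (OffsetInBytes >= MinVectorSizeInBits / 8)
    return std::nullopt;
  // The loaded element lands in this lane of the wide load.
  return (OffsetInBytes * 8) / ScalarSizeInBits;
}

int main() {
  assert(computeShuffleLane(2, 128, 16) == 1u);  // i16 at gep offset 1 -> lane 1
  assert(computeShuffleLane(4, 128, 16) == 2u);  // <2 x i16> at gep offset 1 -> lane 2
  assert(!computeShuffleLane(16, 128, 16));      // one full vector away -> reject
  return 0;
}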
Index: llvm/test/Transforms/VectorCombine/X86/load.ll
===================================================================
--- llvm/test/Transforms/VectorCombine/X86/load.ll
+++ llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
-; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -269,14 +269,19 @@
   ret <8 x i16> %r
 }
 
-; Negative test - can't safely load the offset vector, but could load+shuffle.
+; Can't safely load the offset vector, but can load+shuffle if it is profitable.
 
 define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(17) %p) {
-; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 2
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
-; CHECK-NEXT:    ret <8 x i16> [[R]]
+; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 2
+; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; SSE2-NEXT:    ret <8 x i16> [[R]]
+;
+; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 16
+; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
   %s = load i16, i16* %gep, align 2
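Why these checks split by subtarget: the transform fires only when the TTI cost of the new sequence does not exceed the old one. As a worked sketch with hypothetical cost values (not taken from any real cost table): if OldCost = cost(scalar load) + cost(insertelement) = 1 + 1 = 2, then an SSE2-like shuffle cost of 2 gives NewCost = cost(vector load) + cost(shuffle) = 1 + 2 = 3 > 2, so the scalar form is kept, while an AVX2-like shuffle cost of 1 gives NewCost = 1 + 1 = 2 <= 2, so the load+shuffle is emitted.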
@@ -284,14 +289,19 @@
   ret <8 x i16> %r
 }
 
-; TODO: Verify that alignment of the new load is not over-specified.
+; Verify that alignment of the new load is not over-specified.
 
 define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(<8 x i16>* align 2 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 8
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
-; CHECK-NEXT:    ret <8 x i16> [[R]]
+; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 8
+; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; SSE2-NEXT:    ret <8 x i16> [[R]]
+;
+; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 2
+; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
   %s = load i16, i16* %gep, align 8
@@ -403,12 +413,14 @@
   ret <4 x float> %r
 }
 
-; Negative test? - pointer is not as aligned as load.
+; Pointer is not as aligned as load, but that's ok.
+; TODO: Should we choose the maximum alignment instead?
 
 define <4 x float> @load_f32_insert_v4f32_align(float* align 1 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v4f32_align(
-; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 1
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %s = load float, float* %p, align 4
@@ -561,15 +573,21 @@
   ret <8 x i32> %r
 }
 
-; TODO: Can't safely load the offset vector, but can load+shuffle if it is profitable.
+; Can't safely load the offset vector, but can load+shuffle if it is profitable.
 
 define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 16 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1
-; CHECK-NEXT:    [[L:%.*]] = load <2 x i16>, <2 x i16>* [[GEP]], align 2
-; CHECK-NEXT:    [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
-; CHECK-NEXT:    ret <8 x i16> [[R]]
+; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1
+; SSE2-NEXT:    [[L:%.*]] = load <2 x i16>, <2 x i16>* [[GEP]], align 2
+; SSE2-NEXT:    [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0
+; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; SSE2-NEXT:    ret <8 x i16> [[R]]
+;
+; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16>* [[P:%.*]] to <8 x i16>*
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16
+; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <2 x i16>, <2 x i16>* %p, i64 1
   %l = load <2 x i16>, <2 x i16>* %gep, align 2