diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -152,12 +152,7 @@
   Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
   assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
 
-  // If original AS != Load's AS, we can't bitcast the original pointer and have
-  // to use Load's operand instead. Ideally we would want to strip pointer casts
-  // without changing AS, but there's no API to do that ATM.
   unsigned AS = Load->getPointerAddressSpace();
-  if (AS != SrcPtr->getType()->getPointerAddressSpace())
-    SrcPtr = Load->getPointerOperand();
 
   // We are potentially transforming byte-sized (8-bit) memory accesses, so make
   // sure we have all of our type-based constraints in place for this target.
@@ -245,7 +240,8 @@
   // It is safe and potentially profitable to load a vector directly:
   // inselt undef, load Scalar, 0 --> load VecPtr
   IRBuilder<> Builder(Load);
-  Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS));
+  Value *CastedPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      SrcPtr, MinVecTy->getPointerTo(AS));
   Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
   VecLd = Builder.CreateShuffleVector(VecLd, Mask);
 
diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition-inseltpoison.ll
--- a/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition-inseltpoison.ll
@@ -11,9 +11,7 @@
 ; CHECK-LABEL: @load_from_other_as(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_HOGE:%.*]], align 4, addrspace(5)
-; CHECK-NEXT:    [[B:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to %struct.hoge*
-; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT_HOGE]], %struct.hoge* [[B]], i64 0, i32 0
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[C]] to <1 x float>*
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to <1 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x float>, <1 x float>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[E:%.*]] = shufflevector <1 x float> [[TMP1]], <1 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    store <4 x float> [[E]], <4 x float>* [[RESULTPTR:%.*]], align 16
diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll
--- a/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll
+++ b/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll
@@ -11,9 +11,7 @@
 ; CHECK-LABEL: @load_from_other_as(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_HOGE:%.*]], align 4, addrspace(5)
-; CHECK-NEXT:    [[B:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to %struct.hoge*
-; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT_HOGE]], %struct.hoge* [[B]], i64 0, i32 0
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[C]] to <1 x float>*
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to <1 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x float>, <1 x float>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[E:%.*]] = shufflevector <1 x float> [[TMP1]], <1 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    store <4 x float> [[E]], <4 x float>* [[RESULTPTR:%.*]], align 16
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -253,6 +253,23 @@
   ret <4 x float> %r
 }
 
+; Should work with addrspace even when peeking past unsafe loads through geps
+
+define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(i32* align 16 dereferenceable(16) %v3) {
+; CHECK-LABEL: @unsafe_load_i32_insert_v4i32_addrspace(
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast i32* [[V3:%.*]] to <4 x i32> addrspace(42)*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(42)* [[TMP1]], align 16
+; CHECK-NEXT:    [[INSELT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <4 x i32> [[INSELT]]
+;
+  %t0 = getelementptr inbounds i32, i32* %v3, i32 1
+  %t1 = addrspacecast i32* %t0 to i32 addrspace(42)*
+  %t2 = getelementptr inbounds i32, i32 addrspace(42)* %t1, i64 1
+  %val = load i32, i32 addrspace(42)* %t2, align 4
+  %inselt = insertelement <4 x i32> poison, i32 %val, i32 0
+  ret <4 x i32> %inselt
+}
+
 ; If there are enough dereferenceable bytes, we can offset the vector load.
 
 define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) nofree nosync {
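
Note on the functional change: CreatePointerBitCastOrAddrSpaceCast emits a plain bitcast when the stripped source pointer and the target vector pointer type share an address space, and an addrspacecast when they differ, which is why the "use Load's operand when the address spaces mismatch" workaround can be deleted. The standalone sketch below (not part of the patch; the file name, function name, and address-space numbers are illustrative, and it assumes an LLVM version contemporary with this patch, i.e. typed pointers and Type::getPointerTo still available) reproduces the AMDGPU test situation: an addrspace(5) alloca cast to a <1 x float> pointer in the load's address space.

// pbcoras_demo.cpp -- hypothetical demo, build against LLVM libraries.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  IRBuilder<> B(Ctx);

  // void @demo() with a single basic block to hold the example IR.
  auto *FTy = FunctionType::get(B.getVoidTy(), /*isVarArg=*/false);
  auto *F = Function::Create(FTy, Function::ExternalLinkage, "demo", M);
  B.SetInsertPoint(BasicBlock::Create(Ctx, "bb", F));

  // Source pointer in addrspace(5), like the AMDGPU alloca in the tests.
  Value *SrcPtr = B.CreateAlloca(B.getFloatTy(), /*AddrSpace=*/5);

  // Target type: <1 x float>* in addrspace(0), the load's address space.
  auto *VecTy = FixedVectorType::get(B.getFloatTy(), 1);
  Type *DstTy = VecTy->getPointerTo(/*AddrSpace=*/0);

  // A plain bitcast between pointers in different address spaces is invalid
  // IR, so CreateBitCast cannot be used here; this helper emits an
  // addrspacecast when the address spaces differ and a bitcast otherwise.
  Value *Casted = B.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, DstTy);
  B.CreateAlignedLoad(VecTy, Casted, Align(4));
  B.CreateRetVoid();

  // Prints an addrspacecast followed by the <1 x float> load, mirroring the
  // CHECK lines in the as-transition tests above.
  M.print(outs(), nullptr);
  return 0;
}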