diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1665,7 +1665,71 @@
   case Instruction::FPExt:
     simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
     break;
+  case Instruction::Load: {
+    // Shrink a vector load when only a leading run of its elements is
+    // demanded: load just elements [0, ActiveWidth) and rebuild a value of the
+    // original type whose remaining lanes are poison.
+    auto *LI = cast<LoadInst>(I);
+
+    // Volatile and atomic loads must keep their exact access. The original
+    // load could not be deleted either, so narrowing would leave two accesses
+    // where the program had one.
+    if (!LI->isSimple())
+      return nullptr;
+
+    unsigned UsedBegin = DemandedElts.countTrailingZeros();
+    unsigned ActiveWidth = DemandedElts.getActiveBits();
+    auto *OldTy = cast<FixedVectorType>(LI->getType());
+    unsigned OldNumElts = OldTy->getNumElements();
+
+    // Nothing to shrink when the last element is demanded, and a zero width
+    // would ask for a zero-element vector type.
+    // TODO: Optimize leading elements, which needs adjusting pointers and
+    // alignment.
+    if (ActiveWidth == 0 || ActiveWidth >= OldNumElts)
+      return nullptr;
+
+    unsigned NewNumElts = ActiveWidth;
+    Type *EltTy = OldTy->getElementType();
+    // A single demanded element is loaded as a scalar rather than <1 x T>.
+    Type *NewTy =
+        (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
+
+    // Typed pointers need a cast to the narrower pointee type; opaque pointers
+    // are reused as they are.
+    auto *OldPtr = LI->getPointerOperand();
+    Instruction *NewPtr = nullptr;
+    if (!cast<PointerType>(OldPtr->getType())->isOpaque()) {
+      NewPtr = CastInst::CreatePointerCast(
+          OldPtr, PointerType::get(NewTy, LI->getPointerAddressSpace()));
+    }
+
+    auto *NewLI = new LoadInst(NewTy, NewPtr ? NewPtr : OldPtr, LI->getName(),
+                               LI->isVolatile(), LI->getAlign(),
+                               LI->getOrdering(), LI->getSyncScopeID());
 
+    // Rebuild a value of the original type: demanded lanes keep their
+    // position, every other lane is poison.
+    Instruction *NewI = nullptr;
+    if (NewNumElts == 1) {
+      NewI = InsertElementInst::Create(
+          PoisonValue::get(OldTy), NewLI,
+          ConstantInt::get(Type::getInt32Ty(EltTy->getContext()), UsedBegin));
+    } else {
+      // Mask index NewNumElts selects lane 0 of the poison second operand.
+      // TODO: This needs to be adjusted when optimizing leading elements.
+      SmallVector<int> Mask;
+      for (unsigned Idx = 0; Idx < OldNumElts; ++Idx)
+        Mask.push_back(DemandedElts[Idx] ? Idx : NewNumElts);
+      NewI = new ShuffleVectorInst(NewLI, PoisonValue::get(NewTy), Mask);
+    }
+    copyMetadataForLoad(*NewLI, *LI);
+    if (NewPtr)
+      InsertNewInstWith(NewPtr, *LI);
+    InsertNewInstWith(NewLI, *LI);
+    InsertNewInstWith(NewI, *LI);
+    return NewI;
+  }
   case Instruction::Call: {
     IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
     if (!II) break;
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/simplify-load.ll b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-load.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-load.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck %s
+
+; Make sure the load simplification works for typed pointers.
+define amdgpu_kernel void @simplify_load_vector(<2 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
+; CHECK-LABEL: @simplify_load_vector(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> addrspace(1)* [[IN0:%.*]] to <2 x i32> addrspace(1)*
+; CHECK-NEXT:    [[A1:%.*]] = load <2 x i32>, <2 x i32> addrspace(1)* [[TMP1]], align 16
+; CHECK-NEXT:    store <2 x i32> [[A1]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in0, align 16
+  %r = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  store <2 x i32> %r, <2 x i32> addrspace(1)* %out, align 8
+  ret void
+}
diff --git a/llvm/test/Transforms/InstCombine/obfuscated_splat-inseltpoison.ll b/llvm/test/Transforms/InstCombine/obfuscated_splat-inseltpoison.ll
--- a/llvm/test/Transforms/InstCombine/obfuscated_splat-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/obfuscated_splat-inseltpoison.ll
@@ -5,7 +5,7 @@
   %B = shufflevector <4 x float> %A, <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
   %C = shufflevector <4 x float> %B, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
   %D = shufflevector <4 x float> %C, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-; CHECK:  %D = shufflevector <4 x float> %A, <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK:  %D = shufflevector <4 x float> {{%.*}}, <4 x float> poison, <4 x i32> zeroinitializer
   store <4 x float> %D, ptr %out_ptr
   ret void
 }
diff --git a/llvm/test/Transforms/InstCombine/obfuscated_splat.ll b/llvm/test/Transforms/InstCombine/obfuscated_splat.ll
--- a/llvm/test/Transforms/InstCombine/obfuscated_splat.ll
+++ b/llvm/test/Transforms/InstCombine/obfuscated_splat.ll
@@ -5,7 +5,7 @@
   %B = shufflevector <4 x float> %A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
   %C = shufflevector <4 x float> %B, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
   %D = shufflevector <4 x float> %C, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-; CHECK:  %D = shufflevector <4 x float> %A, <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK:  %D = shufflevector <4 x float> {{%.*}}, <4 x float> poison, <4 x i32> zeroinitializer
   store <4 x float> %D, ptr %out_ptr
   ret void
 }
diff --git a/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll b/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll
--- a/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll
@@ -4,8 +4,8 @@
 define i32 @extract_load(ptr %p) {
 ;
 ; CHECK-LABEL: @extract_load(
-; CHECK-NEXT:    [[X:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4
-; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x i32> [[X]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
 ; CHECK-NEXT:    ret i32 [[EXT]]
 ;
   %x = load <4 x i32>, ptr %p, align 4
@@ -183,10 +183,10 @@
 define float @extract_element_load(<4 x float> %x, ptr %ptr) {
 ;
 ; CHECK-LABEL: @extract_element_load(
-; CHECK-NEXT:    [[LOAD:%.*]] = load <4 x float>, ptr [[PTR:%.*]], align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[LOAD]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[X:%.*]], i64 2
-; CHECK-NEXT:    [[R:%.*]] = fadd float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[PTR:%.*]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x float> [[TMP1]], i64 2
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[X:%.*]], i64 2
+; CHECK-NEXT:    [[R:%.*]] = fadd float [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %load = load <4 x float>, ptr %ptr
diff --git a/llvm/test/Transforms/InstCombine/scalarization.ll b/llvm/test/Transforms/InstCombine/scalarization.ll
--- a/llvm/test/Transforms/InstCombine/scalarization.ll
+++ b/llvm/test/Transforms/InstCombine/scalarization.ll
@@ -4,8 +4,8 @@
 define i32 @extract_load(ptr %p) {
 ;
 ; CHECK-LABEL: @extract_load(
-; CHECK-NEXT:    [[X:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4
-; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x i32> [[X]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
 ; CHECK-NEXT:    ret i32 [[EXT]]
 ;
   %x = load <4 x i32>, ptr %p, align 4
@@ -183,10 +183,10 @@
 define float @extract_element_load(<4 x float> %x, ptr %ptr) {
 ;
 ; CHECK-LABEL: @extract_element_load(
-; CHECK-NEXT:    [[LOAD:%.*]] = load <4 x float>, ptr [[PTR:%.*]], align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[LOAD]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[X:%.*]], i64 2
-; CHECK-NEXT:    [[R:%.*]] = fadd float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[PTR:%.*]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x float> [[TMP1]], i64 2
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[X:%.*]], i64 2
+; CHECK-NEXT:    [[R:%.*]] = fadd float [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %load = load <4 x float>, ptr %ptr
diff --git a/llvm/test/Transforms/InstCombine/shuffle-binop.ll b/llvm/test/Transforms/InstCombine/shuffle-binop.ll
--- a/llvm/test/Transforms/InstCombine/shuffle-binop.ll
+++ b/llvm/test/Transforms/InstCombine/shuffle-binop.ll
@@ -188,10 +188,12 @@
 define <2 x double> @shuffle_op2_0th_element_mask(ptr %a, ptr %b) {
   ;%0 = load <2 x double>, ptr @d, align 16
 ; CHECK-LABEL: @shuffle_op2_0th_element_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[A:%.*]], align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[B:%.*]], align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[A:%.*]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[B:%.*]], align 16
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP3]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x double> [[SHUFFLE]]
 ;
   %1 = load <2 x double>, ptr %a, align 16
diff --git a/llvm/test/Transforms/InstCombine/sink-into-catchswitch.ll b/llvm/test/Transforms/InstCombine/sink-into-catchswitch.ll
--- a/llvm/test/Transforms/InstCombine/sink-into-catchswitch.ll
+++ b/llvm/test/Transforms/InstCombine/sink-into-catchswitch.ll
@@ -9,8 +9,7 @@
 define void @test1(ptr %p) personality ptr @__CxxFrameHandler3 {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  invoke.cont:
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[P:%.*]], align 8
 ; CHECK-NEXT:    invoke void @throw()
 ; CHECK-NEXT:    to label [[UNREACHABLE:%.*]] unwind label [[CATCH_DISPATCH:%.*]]
 ; CHECK:       catch.dispatch:
@@ -20,7 +19,7 @@
 ; CHECK-NEXT:    invoke void @throw() [ "funclet"(token [[CATCH]]) ]
 ; CHECK-NEXT:    to label [[UNREACHABLE]] unwind label [[EHCLEANUP]]
 ; CHECK:       ehcleanup:
-; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ [[TMP2]], [[CATCH_DISPATCH]] ], [ 9, [[INVOKE_CONT1:%.*]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ [[TMP0]], [[CATCH_DISPATCH]] ], [ 9, [[INVOKE_CONT1:%.*]] ]
 ; CHECK-NEXT:    [[CLEANUP:%.*]] = cleanuppad within none []
 ; CHECK-NEXT:    call void @release(i64 [[PHI]]) [ "funclet"(token [[CLEANUP]]) ]
 ; CHECK-NEXT:    cleanupret from [[CLEANUP]] unwind to caller