diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -89,6 +89,7 @@
   bool scalarizeBinopOrCmp(Instruction &I);
   bool foldExtractedCmps(Instruction &I);
   bool foldSingleElementStore(Instruction &I);
+  bool scalarizeLoadExtract(Instruction &I);
 };
 } // namespace
 
@@ -818,6 +819,80 @@
   return false;
 }
 
+/// Try to scalarize vector loads feeding extractelement instructions.
+bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
+  auto *LI = dyn_cast<LoadInst>(&I);
+  const DataLayout &DL = I.getModule()->getDataLayout();
+  if (!LI || LI->isVolatile() || !DL.typeSizeEqualsStoreSize(LI->getType()))
+    return false;
+
+  auto *FixedVT = dyn_cast<FixedVectorType>(LI->getType());
+  if (!FixedVT)
+    return false;
+
+  InstructionCost OriginalCost = TTI.getMemoryOpCost(
+      Instruction::Load, LI->getType(), LI->getAlign(),
+      LI->getPointerAddressSpace());
+  InstructionCost ScalarizedCost = 0;
+
+  Instruction *LastCheckedInst = LI;
+  unsigned NumInstChecked = 0;
+  // Check if all users of the load are extracts with no memory modifications
+  // between the load and the extract. Compute the cost of both the original
+  // code and the scalarized version.
+  for (User *U : LI->users()) {
+    auto *UI = dyn_cast<ExtractElementInst>(U);
+    if (!UI || UI->getParent() != LI->getParent())
+      return false;
+
+    // Check if any instruction between the load and the extract may modify
+    // memory.
+    if (LastCheckedInst->comesBefore(UI)) {
+      for (Instruction &I :
+           make_range(std::next(LI->getIterator()), UI->getIterator())) {
+        // Bail out if we reached the check limit or the instruction may write
+        // to memory.
+        if (NumInstChecked == 6 || I.mayWriteToMemory())
+          return false;
+        NumInstChecked++;
+      }
+    }
+
+    if (!LastCheckedInst)
+      LastCheckedInst = UI;
+    else if (LastCheckedInst->comesBefore(UI))
+      LastCheckedInst = UI;
+
+    auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
+    OriginalCost +=
+        TTI.getVectorInstrCost(Instruction::ExtractElement, LI->getType(),
+                               Index ? Index->getZExtValue() : -1);
+    ScalarizedCost +=
+        TTI.getMemoryOpCost(Instruction::Load, FixedVT->getElementType(),
+                            Align(1), LI->getPointerAddressSpace());
+    ScalarizedCost += TTI.getAddressComputationCost(FixedVT->getElementType());
+  }
+
+  if (ScalarizedCost >= OriginalCost)
+    return false;
+
+  // Replace extracts with narrow scalar loads.
+  for (User *U : LI->users()) {
+    auto *EI = cast<ExtractElementInst>(U);
+    IRBuilder<>::InsertPointGuard Guard(Builder);
+    Builder.SetInsertPoint(EI);
+    Value *GEP = Builder.CreateInBoundsGEP(
+        FixedVT, LI->getOperand(0), {Builder.getInt32(0), EI->getOperand(1)});
+    auto *NewLoad = cast<LoadInst>(Builder.CreateLoad(
+        FixedVT->getElementType(), GEP, EI->getName() + ".scalar"));
+    NewLoad->setAlignment(commonAlignment(
+        LI->getAlign(), DL.getABITypeAlign(NewLoad->getType())));
+    replaceValue(*EI, *NewLoad);
+  }
+
+  return true;
+}
+
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 bool VectorCombine::run() {
@@ -843,6 +918,7 @@
       MadeChange |= foldBitcastShuf(I);
       MadeChange |= scalarizeBinopOrCmp(I);
       MadeChange |= foldExtractedCmps(I);
+      MadeChange |= scalarizeLoadExtract(I);
       MadeChange |= foldSingleElementStore(I);
     }
   }
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
--- a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
@@ -3,8 +3,8 @@
 
 define i32 @load_extract_idx_0(<4 x i32>* %x) {
 ; CHECK-LABEL: @load_extract_idx_0(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i32 3
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 4
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %lv = load <4 x i32>, <4 x i32>* %x
@@ -12,10 +12,34 @@
   ret i32 %r
 }
 
+define i32 @load_extract_idx_0_align_1(<4 x i32>* %x) {
+; CHECK-LABEL: @load_extract_idx_0_align_1(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i32 3
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 1
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %lv = load <4 x i32>, <4 x i32>* %x, align 1
+  %r = extractelement <4 x i32> %lv, i32 3
+  ret i32 %r
+}
+
+define i32 @load_extract_idx_0_align_32(<4 x i32>* %x) {
+; CHECK-LABEL: @load_extract_idx_0_align_32(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i32 3
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %lv = load <4 x i32>, <4 x i32>* %x, align 32
+  %r = extractelement <4 x i32> %lv, i32 3
+  ret i32 %r
+}
+
+
+
 define i32 @load_extract_idx_1(<4 x i32>* %x) {
 ; CHECK-LABEL: @load_extract_idx_1(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i32 1
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 4
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %lv = load <4 x i32>, <4 x i32>* %x
@@ -25,8 +49,8 @@
 
 define i32 @load_extract_idx_2(<4 x i32>* %x) {
 ; CHECK-LABEL: @load_extract_idx_2(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i32 2
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 4
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %lv = load <4 x i32>, <4 x i32>* %x
@@ -36,8 +60,8 @@
 
 define i32 @load_extract_idx_3(<4 x i32>* %x) {
 ; CHECK-LABEL: @load_extract_idx_3(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i32 3
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 4
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %lv = load <4 x i32>, <4 x i32>* %x
@@ -47,8 +71,8 @@
 
 define i32 @load_extract_idx_var_i64(<4 x i32>* %x, i64 %idx) {
 ; CHECK-LABEL: @load_extract_idx_var_i64(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i64 [[IDX:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i64 [[IDX:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 4
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %lv = load <4 x i32>, <4 x i32>* %x
@@ -58,8 +82,8 @@
 
 define i32 @load_extract_idx_var_i32(<4 x i32>* %x, i32 %idx) {
 ; CHECK-LABEL: @load_extract_idx_var_i32(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 4
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %lv = load <4 x i32>, <4 x i32>* %x
@@ -72,8 +96,8 @@
 define i32 @load_extract_clobber_call_before(<4 x i32>* %x) {
 ; CHECK-LABEL: @load_extract_clobber_call_before(
 ; CHECK-NEXT:    call void @clobber()
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i32 2
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 4
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   call void @clobber()
@@ -97,8 +121,8 @@
 
 define i32 @load_extract_clobber_call_after(<4 x i32>* %x) {
 ; CHECK-LABEL: @load_extract_clobber_call_after(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i32 2
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 4
 ; CHECK-NEXT:    call void @clobber()
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
@@ -111,8 +135,8 @@
 define i32 @load_extract_clobber_store_before(<4 x i32>* %x, i8* %y) {
 ; CHECK-LABEL: @load_extract_clobber_store_before(
 ; CHECK-NEXT:    store i8 0, i8* [[Y:%.*]], align 1
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i32 2
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 4
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   store i8 0, i8* %y
@@ -255,9 +279,10 @@
 ; Scalarizing may or may not be profitable, depending on the target.
 define i32 @load_multiple_2_with_variable_indices(<4 x i32>* %x, i64 %idx.0, i64 %idx.1) {
 ; CHECK-LABEL: @load_multiple_2_with_variable_indices(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[E_0:%.*]] = extractelement <4 x i32> [[LV]], i64 [[IDX_0:%.*]]
-; CHECK-NEXT:    [[E_1:%.*]] = extractelement <4 x i32> [[LV]], i64 [[IDX_1:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i64 [[IDX_0:%.*]]
+; CHECK-NEXT:    [[E_0:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X]], i32 0, i64 [[IDX_1:%.*]]
+; CHECK-NEXT:    [[E_1:%.*]] = load i32, i32* [[TMP2]], align 4
 ; CHECK-NEXT:    [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
@@ -270,11 +295,14 @@
 
 define i32 @load_4_extracts_with_variable_indices_short_vector(<4 x i32>* %x, i64 %idx.0, i64 %idx.1, i64 %idx.2, i64 %idx.3) {
 ; CHECK-LABEL: @load_4_extracts_with_variable_indices_short_vector(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[E_0:%.*]] = extractelement <4 x i32> [[LV]], i64 [[IDX_0:%.*]]
-; CHECK-NEXT:    [[E_1:%.*]] = extractelement <4 x i32> [[LV]], i64 [[IDX_1:%.*]]
-; CHECK-NEXT:    [[E_2:%.*]] = extractelement <4 x i32> [[LV]], i64 [[IDX_2:%.*]]
-; CHECK-NEXT:    [[E_3:%.*]] = extractelement <4 x i32> [[LV]], i64 [[IDX_3:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i64 [[IDX_0:%.*]]
+; CHECK-NEXT:    [[E_0:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X]], i32 0, i64 [[IDX_1:%.*]]
+; CHECK-NEXT:    [[E_1:%.*]] = load i32, i32* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X]], i32 0, i64 [[IDX_2:%.*]]
+; CHECK-NEXT:    [[E_2:%.*]] = load i32, i32* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X]], i32 0, i64 [[IDX_3:%.*]]
+; CHECK-NEXT:    [[E_3:%.*]] = load i32, i32* [[TMP4]], align 4
 ; CHECK-NEXT:    [[RES_0:%.*]] = add i32 [[E_0]], [[E_1]]
 ; CHECK-NEXT:    [[RES_1:%.*]] = add i32 [[RES_0]], [[E_2]]
 ; CHECK-NEXT:    [[RES_2:%.*]] = add i32 [[RES_1]], [[E_3]]
@@ -293,9 +321,10 @@
 
 define i32 @load_multiple_extracts_with_variable_indices_large_vector(<16 x i32>* %x, i64 %idx.0, i64 %idx.1) {
 ; CHECK-LABEL: @load_multiple_extracts_with_variable_indices_large_vector(
-; CHECK-NEXT:    [[LV:%.*]] = load <16 x i32>, <16 x i32>* [[X:%.*]], align 64
-; CHECK-NEXT:    [[E_0:%.*]] = extractelement <16 x i32> [[LV]], i64 [[IDX_0:%.*]]
-; CHECK-NEXT:    [[E_1:%.*]] = extractelement <16 x i32> [[LV]], i64 [[IDX_1:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <16 x i32>, <16 x i32>* [[X:%.*]], i32 0, i64 [[IDX_0:%.*]]
+; CHECK-NEXT:    [[E_0:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <16 x i32>, <16 x i32>* [[X]], i32 0, i64 [[IDX_1:%.*]]
+; CHECK-NEXT:    [[E_1:%.*]] = load i32, i32* [[TMP2]], align 4
 ; CHECK-NEXT:    [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -630,14 +630,14 @@
 define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) {
 ; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
 ; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1
-; SSE2-NEXT:    [[L:%.*]] = load <2 x i16>, <2 x i16>* [[GEP]], align 8
-; SSE2-NEXT:    [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0
+; SSE2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[GEP]], i32 0, i32 0
+; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[TMP1]], align 2
 ; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
 ; SSE2-NEXT:    ret <8 x i16> [[R]]
 ;
 ; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
 ; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16>* [[P:%.*]] to <8 x i16>*
-; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
 ; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:    ret <8 x i16> [[R]]
 ;
diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -630,14 +630,14 @@
 define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) {
 ; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
 ; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1
-; SSE2-NEXT:    [[L:%.*]] = load <2 x i16>, <2 x i16>* [[GEP]], align 8
-; SSE2-NEXT:    [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0
+; SSE2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[GEP]], i32 0, i32 0
+; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[TMP1]], align 2
 ; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
 ; SSE2-NEXT:    ret <8 x i16> [[R]]
 ;
 ; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
 ; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16>* [[P:%.*]] to <8 x i16>*
-; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
 ; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:    ret <8 x i16> [[R]]
 ;
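
Reviewer note: a minimal before/after sketch of the rewrite this patch performs (hand-written IR for illustration, not taken from the test files; value names are hypothetical):

  ; Before: a vector load whose only users are extractelement instructions.
  %lv = load <4 x i32>, <4 x i32>* %x, align 16
  %r = extractelement <4 x i32> %lv, i32 2

  ; After: each extract becomes a narrow scalar load through an inbounds GEP
  ; into the vector. The new load's alignment is the common alignment of the
  ; original load and the element type's ABI alignment, here
  ; commonAlignment(16, 4) = 4.
  %gep = getelementptr inbounds <4 x i32>, <4 x i32>* %x, i32 0, i32 2
  %r.scalar = load i32, i32* %gep, align 4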