Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1894,6 +1894,24 @@
   });
 }
 
+template <typename T>
+static unsigned getAlignment(ArrayRef<Value *> VL, unsigned DefaultAlignment) {
+  unsigned Alignment = DefaultAlignment;
+  auto *MinAlignedStore = cast<T>(*std::min_element(
+      VL.begin(), VL.end(), [Alignment](const Value *V1, const Value *V2) {
+        unsigned Alignment1 = cast<T>(V1)->getAlignment();
+        unsigned Alignment2 = cast<T>(V2)->getAlignment();
+        if (!Alignment1)
+          Alignment1 = Alignment;
+        if (!Alignment2)
+          Alignment2 = Alignment;
+        return Alignment1 < Alignment2;
+      }));
+  if (unsigned MinAlignment = MinAlignedStore->getAlignment())
+    Alignment = MinAlignment;
+  return Alignment;
+}
+
 int BoUpSLP::getEntryCost(TreeEntry *E) {
   ArrayRef<Value *> VL = E->Scalars;
 
@@ -2075,11 +2093,12 @@
     }
     case Instruction::Load: {
       // Cost of wide load - cost of scalar loads.
-      unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
+      unsigned Alignment =
+          getAlignment<LoadInst>(VL, DL->getABITypeAlignment(VL0->getType()));
       int ScalarLdCost = VecTy->getNumElements() *
-          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
-      int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
-                                           VecTy, alignment, 0, VL0);
+          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, Alignment, 0, VL0);
+      int VecLdCost =
+          TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0, VL0);
       return VecLdCost - ScalarLdCost;
     }
     case Instruction::Store: {
@@ -2888,12 +2907,9 @@
       if (getTreeEntry(PO))
         ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
 
-      unsigned Alignment = LI->getAlignment();
       LI = Builder.CreateLoad(VecPtr);
-      if (!Alignment) {
-        Alignment = DL->getABITypeAlignment(ScalarLoadTy);
-      }
-      LI->setAlignment(Alignment);
+      LI->setAlignment(getAlignment<LoadInst>(
+          E->Scalars, DL->getABITypeAlignment(ScalarLoadTy)));
       E->VectorizedValue = LI;
       ++NumVectorInstructions;
       return propagateMetadata(LI, E->Scalars);
Index: test/Transforms/SLPVectorizer/X86/horizontal-list.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -13,15 +13,15 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
 ; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20
x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 4 ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 ; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]] @@ -40,15 +40,15 @@ ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 4 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 4 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]] ; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 ; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]] ; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 ; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]] -; THRESHOLD-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 -; THRESHOLD-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 +; THRESHOLD-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 4 +; THRESHOLD-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 4 ; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]] ; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 ; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]] @@ -97,8 +97,8 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 4 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]] ; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]] @@ -128,8 +128,8 @@ ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 
[[TMP0]], 3 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 4 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 4 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] ; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]] ; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]] @@ -202,8 +202,8 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 4 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef ; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]] @@ -221,8 +221,8 @@ ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 4 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 4 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] ; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef ; THRESHOLD-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]] @@ -264,8 +264,8 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 4 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef ; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]] @@ -284,8 +284,8 @@ ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x 
float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 4 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 4 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] ; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef ; THRESHOLD-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]] @@ -327,8 +327,8 @@ define float @bar() { ; CHECK-LABEL: @bar( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 4 ; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 @@ -349,8 +349,8 @@ ; ; THRESHOLD-LABEL: @bar( ; THRESHOLD-NEXT: entry: -; THRESHOLD-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 4 +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 4 ; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]] ; THRESHOLD-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 ; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 Index: test/Transforms/SLPVectorizer/X86/horizontal.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal.ll +++ test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -515,7 +515,7 @@ define void @float_red_example4(float* %res) { ; STORE-LABEL: @float_red_example4( -; STORE: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([32 x float]* @arr_float to <4 x float>*), align 16 +; STORE: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([32 x float]* @arr_float to <4 x float>*), align 4 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> undef, <4 x i32> ; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP0]], [[RDX_SHUF]] ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> @@ -538,7 +538,7 @@ define void @float_red_example8(float* %res) { ; STORE-LABEL: @float_red_example8( -; STORE: [[TMP0:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr_float to <8 x float>*), align 16 +; STORE: [[TMP0:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr_float to <8 x float>*), align 4 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> undef, <8 x i32> ; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP0]], [[RDX_SHUF]] ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> @@ -571,7 +571,7 @@ define void 
@float_red_example16(float* %res) { ; STORE-LABEL: @float_red_example16( -; STORE: [[TMP0:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr_float to <16 x float>*), align 16 +; STORE: [[TMP0:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr_float to <16 x float>*), align 4 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> undef, <16 x i32> ; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP0]], [[RDX_SHUF]] ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> @@ -622,7 +622,7 @@ define void @i32_red_example4(i32* %res) { ; STORE-LABEL: @i32_red_example4( -; STORE: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16 +; STORE: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 4 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> ; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <4 x i32> [[TMP0]], [[RDX_SHUF]] ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> @@ -645,7 +645,7 @@ define void @i32_red_example8(i32* %res) { ; STORE-LABEL: @i32_red_example8( -; STORE: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 +; STORE: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 4 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> ; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <8 x i32> [[TMP0]], [[RDX_SHUF]] ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> @@ -678,7 +678,7 @@ define void @i32_red_example16(i32* %res) { ; STORE-LABEL: @i32_red_example16( -; STORE: [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 16 +; STORE: [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 4 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <16 x i32> ; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <16 x i32> [[TMP0]], [[RDX_SHUF]] ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> @@ -729,7 +729,7 @@ define void @i32_red_example32(i32* %res) { ; STORE-LABEL: @i32_red_example32( -; STORE: [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 16 +; STORE: [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 4 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> undef, <32 x i32> ; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <32 x i32> [[TMP0]], [[RDX_SHUF]] ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> @@ -817,7 +817,7 @@ define void @i32_red_call(i32 %val) { ; CHECK-LABEL: @i32_red_call( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 undef, undef ; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 undef, [[ADD]] ; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 undef, [[ADD_1]] @@ -858,7 +858,7 @@ define void @i32_red_invoke(i32 %val) personality i32 (...)* @__gxx_personality_v0 { ; CHECK-LABEL: @i32_red_invoke( ; CHECK-NEXT: 
entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 undef, undef ; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 undef, [[ADD]] ; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 undef, [[ADD_1]] Index: test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll +++ test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll @@ -412,52 +412,52 @@ define i32 @foo1() local_unnamed_addr #0 { ; CHECK-LABEL: @foo1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([64 x i32]* @ib to <4 x i32>*), align 16 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([64 x i32]* @ib to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i32> , [[TMP0]] ; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* bitcast ([64 x i32]* @ia to <4 x i32>*), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 4) to <4 x i32>*), align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 4) to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> , [[TMP2]] ; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 4) to <4 x i32>*), align 16 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 8) to <4 x i32>*), align 16 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 8) to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i32> , [[TMP4]] ; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 8) to <4 x i32>*), align 16 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 12) to <4 x i32>*), align 16 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 12) to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> , [[TMP6]] ; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 12) to <4 x i32>*), align 16 -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 16) to <4 x i32>*), align 16 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 16) to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i32> , [[TMP8]] ; CHECK-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 16) to <4 x i32>*), align 16 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 20) to <4 x i32>*), align 16 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], 
[64 x i32]* @ib, i64 0, i64 20) to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i32> , [[TMP10]] ; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 20) to <4 x i32>*), align 16 -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 24) to <4 x i32>*), align 16 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 24) to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i32> , [[TMP12]] ; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 24) to <4 x i32>*), align 16 -; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 28) to <4 x i32>*), align 16 +; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 28) to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i32> , [[TMP14]] ; CHECK-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 28) to <4 x i32>*), align 16 -; CHECK-NEXT: [[TMP16:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 32) to <4 x i32>*), align 16 +; CHECK-NEXT: [[TMP16:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 32) to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP17:%.*]] = xor <4 x i32> , [[TMP16]] ; CHECK-NEXT: store <4 x i32> [[TMP17]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 32) to <4 x i32>*), align 16 -; CHECK-NEXT: [[TMP18:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 36) to <4 x i32>*), align 16 +; CHECK-NEXT: [[TMP18:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 36) to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP19:%.*]] = xor <4 x i32> , [[TMP18]] ; CHECK-NEXT: store <4 x i32> [[TMP19]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 36) to <4 x i32>*), align 16 -; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 40) to <4 x i32>*), align 16 +; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 40) to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP21:%.*]] = xor <4 x i32> , [[TMP20]] ; CHECK-NEXT: store <4 x i32> [[TMP21]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 40) to <4 x i32>*), align 16 -; CHECK-NEXT: [[TMP22:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 44) to <4 x i32>*), align 16 +; CHECK-NEXT: [[TMP22:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 44) to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP23:%.*]] = xor <4 x i32> , [[TMP22]] ; CHECK-NEXT: store <4 x i32> [[TMP23]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 44) to <4 x i32>*), align 16 -; CHECK-NEXT: 
[[TMP24:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 48) to <4 x i32>*), align 16 +; CHECK-NEXT: [[TMP24:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 48) to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP25:%.*]] = xor <4 x i32> , [[TMP24]] ; CHECK-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 48) to <4 x i32>*), align 16 -; CHECK-NEXT: [[TMP26:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 52) to <4 x i32>*), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 52) to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP27:%.*]] = xor <4 x i32> , [[TMP26]] ; CHECK-NEXT: store <4 x i32> [[TMP27]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 52) to <4 x i32>*), align 16 -; CHECK-NEXT: [[TMP28:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 56) to <4 x i32>*), align 16 +; CHECK-NEXT: [[TMP28:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 56) to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP29:%.*]] = xor <4 x i32> , [[TMP28]] ; CHECK-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 56) to <4 x i32>*), align 16 -; CHECK-NEXT: [[TMP30:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 60) to <4 x i32>*), align 16 +; CHECK-NEXT: [[TMP30:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 60) to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP31:%.*]] = xor <4 x i32> , [[TMP30]] ; CHECK-NEXT: store <4 x i32> [[TMP31]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 60) to <4 x i32>*), align 16 ; CHECK-NEXT: br label [[FOR_BODY5:%.*]] Index: test/Transforms/SLPVectorizer/X86/limit.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/limit.ll +++ test/Transforms/SLPVectorizer/X86/limit.ll @@ -18,11 +18,11 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 16 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @c to <4 x i32>*), align 16 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @c to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @d to <4 x i32>*), align 16 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @e to <4 x i32>*), align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @d to <4 x i32>*), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @e to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP2]], [[TMP5]] ; CHECK-NEXT: 
[[TMP7:%.*]] = sub nsw <4 x i32> [[TMP2]], [[TMP5]] Index: test/Transforms/SLPVectorizer/X86/sitofp.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/sitofp.ll +++ test/Transforms/SLPVectorizer/X86/sitofp.ll @@ -39,7 +39,7 @@ ; AVX256-NEXT: ret void ; ; AVX512-LABEL: @sitofp_2i64_2f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 +; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 8 ; AVX512-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double> ; AVX512-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; AVX512-NEXT: ret void @@ -85,7 +85,7 @@ ; AVX256-NEXT: ret void ; ; AVX512-LABEL: @sitofp_4i64_4f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 8 ; AVX512-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> ; AVX512-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX512-NEXT: ret void @@ -161,7 +161,7 @@ ; AVX256-NEXT: ret void ; ; AVX512-LABEL: @sitofp_8i64_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 8 ; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x double> ; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 ; AVX512-NEXT: ret void @@ -229,7 +229,7 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i32_4f64( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 4 ; AVX-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double> ; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX-NEXT: ret void @@ -278,8 +278,8 @@ ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i32_8f64( -; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 4 ; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double> ; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 @@ -287,7 +287,7 @@ ; AVX256-NEXT: ret void ; ; AVX512-LABEL: @sitofp_8i32_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 4 ; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x 
double> ; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 ; AVX512-NEXT: ret void @@ -355,7 +355,7 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i16_4f64( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 2 ; AVX-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double> ; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX-NEXT: ret void @@ -404,8 +404,8 @@ ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i16_8f64( -; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 2 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 2 ; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double> ; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 @@ -413,7 +413,7 @@ ; AVX256-NEXT: ret void ; ; AVX512-LABEL: @sitofp_8i16_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 2 ; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x double> ; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 ; AVX512-NEXT: ret void @@ -481,7 +481,7 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i8_4f64( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 1 ; AVX-NEXT: [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double> ; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 ; AVX-NEXT: ret void @@ -530,8 +530,8 @@ ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i8_8f64( -; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 1 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 1 ; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double> ; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 @@ -539,7 +539,7 @@ ; AVX256-NEXT: ret void ; ; AVX512-LABEL: @sitofp_8i8_8f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 
to <8 x i8>*), align 64 +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 1 ; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x double> ; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64 ; AVX512-NEXT: ret void @@ -626,7 +626,7 @@ ; AVX256-NEXT: ret void ; ; AVX512-LABEL: @sitofp_4i64_4f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 8 ; AVX512-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> ; AVX512-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 ; AVX512-NEXT: ret void @@ -702,7 +702,7 @@ ; AVX256-NEXT: ret void ; ; AVX512-LABEL: @sitofp_8i64_8f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 8 ; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float> ; AVX512-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX512-NEXT: ret void @@ -736,7 +736,7 @@ define void @sitofp_4i32_4f32() #0 { ; CHECK-LABEL: @sitofp_4i32_4f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> ; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 ; CHECK-NEXT: ret void @@ -758,8 +758,8 @@ define void @sitofp_8i32_8f32() #0 { ; SSE-LABEL: @sitofp_8i32_8f32( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> ; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 @@ -767,7 +767,7 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_8i32_8f32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 4 ; AVX-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float> ; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX-NEXT: ret void @@ -801,10 +801,10 @@ define void @sitofp_16i32_16f32() #0 { ; SSE-LABEL: @sitofp_16i32_16f32( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x 
i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> ; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float> ; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float> @@ -816,8 +816,8 @@ ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i32_16f32( -; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32 +; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float> ; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[TMP2]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 @@ -825,7 +825,7 @@ ; AVX256-NEXT: ret void ; ; AVX512-LABEL: @sitofp_16i32_16f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64 +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 4 ; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i32> [[TMP1]] to <16 x float> ; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 ; AVX512-NEXT: ret void @@ -883,7 +883,7 @@ define void @sitofp_4i16_4f32() #0 { ; CHECK-LABEL: @sitofp_4i16_4f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 2 ; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> ; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 ; CHECK-NEXT: ret void @@ -905,8 +905,8 @@ define void @sitofp_8i16_8f32() #0 { ; SSE-LABEL: @sitofp_8i16_8f32( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 
to <4 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 2 ; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> ; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 @@ -914,7 +914,7 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_8i16_8f32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 2 ; AVX-NEXT: [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float> ; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX-NEXT: ret void @@ -948,10 +948,10 @@ define void @sitofp_16i16_16f32() #0 { ; SSE-LABEL: @sitofp_16i16_16f32( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 2 ; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> ; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> ; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float> @@ -963,8 +963,8 @@ ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i16_16f32( -; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16 +; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 2 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 2 ; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float> ; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i16> [[TMP2]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 @@ -972,7 +972,7 @@ ; AVX256-NEXT: ret void ; ; AVX512-LABEL: @sitofp_16i16_16f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64 +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 
x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 2 ; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i16> [[TMP1]] to <16 x float> ; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 ; AVX512-NEXT: ret void @@ -1030,7 +1030,7 @@ define void @sitofp_4i8_4f32() #0 { ; CHECK-LABEL: @sitofp_4i8_4f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 1 ; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float> ; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 ; CHECK-NEXT: ret void @@ -1052,8 +1052,8 @@ define void @sitofp_8i8_8f32() #0 { ; SSE-LABEL: @sitofp_8i8_8f32( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 1 ; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float> ; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 @@ -1061,7 +1061,7 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_8i8_8f32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 1 ; AVX-NEXT: [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float> ; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 ; AVX-NEXT: ret void @@ -1095,10 +1095,10 @@ define void @sitofp_16i8_16f32() #0 { ; SSE-LABEL: @sitofp_16i8_16f32( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 1 ; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float> ; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float> ; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float> @@ 
-1110,8 +1110,8 @@ ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i8_16f32( -; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8 +; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 1 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 1 ; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float> ; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i8> [[TMP2]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 @@ -1119,7 +1119,7 @@ ; AVX256-NEXT: ret void ; ; AVX512-LABEL: @sitofp_16i8_16f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64 +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 1 ; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i8> [[TMP1]] to <16 x float> ; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64 ; AVX512-NEXT: ret void @@ -1181,10 +1181,10 @@ define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 { ; CHECK-LABEL: @sitofp_4xi32_4f64( -; CHECK-NEXT: [[CVT0:%.*]] = sitofp i32 %a0 to double -; CHECK-NEXT: [[CVT1:%.*]] = sitofp i32 %a1 to double -; CHECK-NEXT: [[CVT2:%.*]] = sitofp i32 %a2 to double -; CHECK-NEXT: [[CVT3:%.*]] = sitofp i32 %a3 to double +; CHECK-NEXT: [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to double +; CHECK-NEXT: [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to double +; CHECK-NEXT: [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to double +; CHECK-NEXT: [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to double ; CHECK-NEXT: [[RES0:%.*]] = insertelement <4 x double> undef, double [[CVT0]], i32 0 ; CHECK-NEXT: [[RES1:%.*]] = insertelement <4 x double> [[RES0]], double [[CVT1]], i32 1 ; CHECK-NEXT: [[RES2:%.*]] = insertelement <4 x double> [[RES1]], double [[CVT2]], i32 2 @@ -1204,10 +1204,10 @@ define <4 x float> @sitofp_4xi32_4f32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 { ; CHECK-LABEL: @sitofp_4xi32_4f32( -; CHECK-NEXT: [[CVT0:%.*]] = sitofp i32 %a0 to float -; CHECK-NEXT: [[CVT1:%.*]] = sitofp i32 %a1 to float -; CHECK-NEXT: [[CVT2:%.*]] = sitofp i32 %a2 to float -; CHECK-NEXT: [[CVT3:%.*]] = sitofp i32 %a3 to float +; CHECK-NEXT: [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to float +; CHECK-NEXT: [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to float +; CHECK-NEXT: [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to float +; CHECK-NEXT: [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to float ; CHECK-NEXT: [[RES0:%.*]] = insertelement <4 x float> undef, float [[CVT0]], i32 0 ; CHECK-NEXT: [[RES1:%.*]] = insertelement <4 x float> [[RES0]], float [[CVT1]], i32 1 ; CHECK-NEXT: [[RES2:%.*]] = insertelement <4 x float> [[RES1]], float [[CVT2]], i32 2 Index: test/Transforms/SLPVectorizer/X86/uitofp.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/uitofp.ll +++ test/Transforms/SLPVectorizer/X86/uitofp.ll @@ -39,7 +39,7 @@ ; AVX256-NEXT: ret void ; ; AVX512-LABEL: @uitofp_2i64_2f64( -; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 +; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 
x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 8
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
; AVX512-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
; AVX512-NEXT: ret void
@@ -85,7 +85,7 @@
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_4i64_4f64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 8
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
; AVX512-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX512-NEXT: ret void
@@ -161,7 +161,7 @@
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_8i64_8f64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x double>
; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX512-NEXT: ret void
@@ -213,7 +213,7 @@
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_2i32_2f64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
+; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 4
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
; AVX512-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
; AVX512-NEXT: ret void
@@ -244,7 +244,7 @@
; SSE-NEXT: ret void
;
; AVX-LABEL: @uitofp_4i32_4f64(
-; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double>
; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX-NEXT: ret void
@@ -293,8 +293,8 @@
; SSE-NEXT: ret void
;
; AVX256-LABEL: @uitofp_8i32_8f64(
-; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
-; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 4
+; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 4
; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double>
; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x double>
; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
@@ -302,7 +302,7 @@
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_8i32_8f64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 4
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x double>
; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX512-NEXT: ret void
@@ -370,7 +370,7 @@
; SSE-NEXT: ret void
;
; AVX-LABEL: @uitofp_4i16_4f64(
-; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x double>
; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX-NEXT: ret void
@@ -419,8 +419,8 @@
; SSE-NEXT: ret void
;
; AVX256-LABEL: @uitofp_8i16_8f64(
-; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 2
+; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 2
; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x double>
; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x double>
; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
@@ -428,7 +428,7 @@
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_8i16_8f64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 2
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x double>
; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX512-NEXT: ret void
@@ -480,7 +480,7 @@
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_2i8_2f64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
+; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 1
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
; AVX512-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
; AVX512-NEXT: ret void
@@ -511,7 +511,7 @@
; SSE-NEXT: ret void
;
; AVX-LABEL: @uitofp_4i8_4f64(
-; AVX-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 1
; AVX-NEXT: [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x double>
; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX-NEXT: ret void
@@ -560,8 +560,8 @@
; SSE-NEXT: ret void
;
; AVX256-LABEL: @uitofp_8i8_8f64(
-; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
-; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 1
+; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 1
; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x double>
; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x double>
; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
@@ -569,7 +569,7 @@
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_8i8_8f64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 1
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x double>
; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX512-NEXT: ret void
@@ -656,7 +656,7 @@
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_4i64_4f32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 8
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
; AVX512-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
; AVX512-NEXT: ret void
@@ -732,7 +732,7 @@
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_8i64_8f32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
; AVX512-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX512-NEXT: ret void
@@ -766,7 +766,7 @@
define void @uitofp_4i32_4f32() #0 {
; CHECK-LABEL: @uitofp_4i32_4f32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 4
; CHECK-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
; CHECK-NEXT: ret void
@@ -788,8 +788,8 @@
define void @uitofp_8i32_8f32() #0 {
; SSE-LABEL: @uitofp_8i32_8f32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x float>
; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
@@ -797,7 +797,7 @@
; SSE-NEXT: ret void
;
; AVX-LABEL: @uitofp_8i32_8f32(
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float>
; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX-NEXT: ret void
@@ -831,10 +831,10 @@
define void @uitofp_16i32_16f32() #0 {
; SSE-LABEL: @uitofp_16i32_16f32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
-; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32
-; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP5:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x float>
; SSE-NEXT: [[TMP7:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
@@ -846,8 +846,8 @@
; SSE-NEXT: ret void
;
; AVX256-LABEL: @uitofp_16i32_16f32(
-; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
-; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32
+; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 4
+; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX256-NEXT: [[TMP3:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float>
; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i32> [[TMP2]] to <8 x float>
; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
@@ -855,7 +855,7 @@
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_16i32_16f32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i32> [[TMP1]] to <16 x float>
; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
; AVX512-NEXT: ret void
@@ -913,7 +913,7 @@
define void @uitofp_4i16_4f32() #0 {
; CHECK-LABEL: @uitofp_4i16_4f32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 2
; CHECK-NEXT: [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
; CHECK-NEXT: ret void
@@ -935,8 +935,8 @@
define void @uitofp_8i16_8f32() #0 {
; SSE-LABEL: @uitofp_8i16_8f32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 2
; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float>
; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
@@ -944,7 +944,7 @@
; SSE-NEXT: ret void
;
; AVX-LABEL: @uitofp_8i16_8f32(
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float>
; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX-NEXT: ret void
@@ -978,10 +978,10 @@
define void @uitofp_16i16_16f32() #0 {
; SSE-LABEL: @uitofp_16i16_16f32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
-; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
-; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 2
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 2
; SSE-NEXT: [[TMP5:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float>
; SSE-NEXT: [[TMP7:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float>
@@ -993,8 +993,8 @@
; SSE-NEXT: ret void
;
; AVX256-LABEL: @uitofp_16i16_16f32(
-; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
-; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16
+; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 2
+; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 2
; AVX256-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float>
; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x float>
; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
@@ -1002,7 +1002,7 @@
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_16i16_16f32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i16> [[TMP1]] to <16 x float>
; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
; AVX512-NEXT: ret void
@@ -1060,7 +1060,7 @@
define void @uitofp_4i8_4f32() #0 {
; CHECK-LABEL: @uitofp_4i8_4f32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
; CHECK-NEXT: ret void
@@ -1082,8 +1082,8 @@
define void @uitofp_8i8_8f32() #0 {
; SSE-LABEL: @uitofp_8i8_8f32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 1
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 1
; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x float>
; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
@@ -1091,7 +1091,7 @@
; SSE-NEXT: ret void
;
; AVX-LABEL: @uitofp_8i8_8f32(
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 1
; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float>
; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX-NEXT: ret void
@@ -1125,10 +1125,10 @@
define void @uitofp_16i8_16f32() #0 {
; SSE-LABEL: @uitofp_16i8_16f32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
-; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8
-; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 1
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 1
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 1
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 1
; SSE-NEXT: [[TMP5:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x float>
; SSE-NEXT: [[TMP7:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x float>
@@ -1140,8 +1140,8 @@
; SSE-NEXT: ret void
;
; AVX256-LABEL: @uitofp_16i8_16f32(
-; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
-; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8
+; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 1
+; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 1
; AVX256-NEXT: [[TMP3:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float>
; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i8> [[TMP2]] to <8 x float>
; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
@@ -1149,7 +1149,7 @@
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_16i8_16f32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 1
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i8> [[TMP1]] to <16 x float>
; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
; AVX512-NEXT: ret void