diff --git a/llvm/test/Analysis/BasicAA/libfuncs.ll b/llvm/test/Analysis/BasicAA/libfuncs.ll
--- a/llvm/test/Analysis/BasicAA/libfuncs.ll
+++ b/llvm/test/Analysis/BasicAA/libfuncs.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=i386-pc-linux-gnu -aa-pipeline=basic-aa -passes=inferattrs,aa-eval -print-all-alias-modref-info -disable-output 2>&1 %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-pc-linux-gnu -aa-pipeline=basic-aa -passes=inferattrs,aa-eval -print-all-alias-modref-info -disable-output 2>&1 %s | FileCheck %s
 ; CHECK-LABEL: Function: test_memcmp_const_size
 ; CHECK: Just Ref: Ptr: i8* %a <-> %res = tail call i32 @memcmp(ptr %a, ptr %b, i64 4)
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll
--- a/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll
@@ -1,19 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve -S -o - < %s | FileCheck %s
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
 define void @sve_fptruncs() {
-  ;CHECK-LABEL: 'sve_fptruncs'
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x half>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x half>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x half>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x half>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x half>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x half>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f32_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x float>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4_f32_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x float>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8_f32_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x float>
+; CHECK-LABEL: 'sve_fptruncs'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x half>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x half>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x half>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x half>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x half>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x half>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f32_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x float>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4_f32_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x float>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8_f32_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x float>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
   %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x half>
   %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x half>
   %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x half>
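For reference, the NOTE lines being added above mark assertions regenerated by utils/update_analyze_test_checks.py. A minimal cost-model test of the same shape as these files would look like the sketch below; the function name is hypothetical and the costs are left as FileCheck regexes rather than real target numbers, but the RUN convention matches the tests in this patch.

; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s

define void @example_trunc() {
; CHECK-LABEL: 'example_trunc'
; CHECK-NEXT: Cost Model: Found an estimated cost of {{[0-9]+}} for instruction: %t = trunc <vscale x 4 x i32> undef to <vscale x 4 x i1>
; CHECK-NEXT: Cost Model: Found an estimated cost of {{[0-9]+}} for instruction: ret void
  %t = trunc <vscale x 4 x i32> undef to <vscale x 4 x i1>
  ret void
}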
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll b/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll
--- a/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll
@@ -1,24 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -mtriple=aarch64-linux-gnu -mattr=+sve -passes="print<cost-model>" 2>&1 -disable-output < %s | FileCheck %s
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 define void @sve_truncs() {
-  ;CHECK-LABEL: 'sve_truncs'
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i16_to_i1 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i1>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i32_to_i1 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i1>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i64_to_i1 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i1>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v4i16_to_i1 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i1>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v4i32_to_i1 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i1>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %trunc_v4i64_to_i1 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i1>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v8i16_to_i1 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i1>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %trunc_v8i32_to_i1 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i1>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %trunc_v8i64_to_i1 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i1>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i32_to_i16 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i16>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i64_to_i32 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i32>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v4i32_to_i16 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i16>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %trunc_v4i64_to_i32 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i32>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %trunc_v8i32_to_i16 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i16>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %trunc_v8i64_to_i32 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i32>
+; CHECK-LABEL: 'sve_truncs'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i16_to_i1 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i1>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i32_to_i1 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i1>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i64_to_i1 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i1>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v4i16_to_i1 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i1>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v4i32_to_i1 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i1>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %trunc_v4i64_to_i1 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i1>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v8i16_to_i1 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i1>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %trunc_v8i32_to_i1 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i1>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %trunc_v8i64_to_i1 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i1>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i32_to_i16 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i16>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i64_to_i32 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v4i32_to_i16 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i16>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %trunc_v4i64_to_i32 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %trunc_v8i32_to_i16 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i16>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %trunc_v8i64_to_i32 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
   %trunc_v2i16_to_i1 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i1>
   %trunc_v2i32_to_i1 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i1>
   %trunc_v2i64_to_i1 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i1>
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/cast.ll b/llvm/test/Analysis/CostModel/AMDGPU/cast.ll
--- a/llvm/test/Analysis/CostModel/AMDGPU/cast.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/cast.ll
@@ -74,11 +74,11 @@
 ; FAST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %E = trunc <4 x i64> undef to <4 x i32>
 ; FAST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F = trunc <8 x i32> undef to <8 x i16>
 ; FAST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F1 = trunc <16 x i16> undef to <16 x i8>
-; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8>
-; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8>
+; FAST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8>
+; FAST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8>
 ; FAST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %G = trunc <8 x i64> undef to <8 x i32>
 ; FAST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16>
-; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
+; FAST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
 ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; SLOW-LABEL: 'zext_sext'
@@ -134,11 +134,11 @@
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %E = trunc <4 x i64> undef to <4 x i32>
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F = trunc <8 x i32> undef to <8 x i16>
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F1 = trunc <16 x i16> undef to <16 x i8>
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8>
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8>
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8>
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8>
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %G = trunc <8 x i64> undef to <8 x i32>
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16>
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLOW-SIZE-LABEL: 'zext_sext'
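The AMDGPU cast test above keeps separate prefixes for the throughput (FAST/SLOW) and code-size (FAST-SIZE/SLOW-SIZE) cost kinds; its RUN lines sit outside these hunks. The usual way to drive multiple prefixes from one file is sketched below; the flags shown are illustrative assumptions, not the file's exact invocations.

; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn < %s | FileCheck %s -check-prefixes=FAST
; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn < %s | FileCheck %s -check-prefixes=FAST-SIZE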
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/load-to-trunc.ll b/llvm/test/Analysis/CostModel/AMDGPU/load-to-trunc.ll
--- a/llvm/test/Analysis/CostModel/AMDGPU/load-to-trunc.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/load-to-trunc.ll
@@ -8,7 +8,7 @@
 ; Check that cost is 1 for unusual load to register sized load.
 define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc
 ;
@@ -19,7 +19,7 @@
 define i128 @loadUnusualInteger(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualInteger'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %out = load i128, ptr %ptr, align 8
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out
 ;
   %out = load i128, ptr %ptr
diff --git a/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll b/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll
--- a/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll
+++ b/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll
@@ -9,8 +9,8 @@
 ; Check that cost is 1 for unusual load to register sized load.
 define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc = trunc i128 %out to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc
 ;
   %out = load i128, ptr %ptr
@@ -20,7 +20,7 @@
 define i128 @loadUnusualInteger(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualInteger'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out
 ;
   %out = load i128, ptr %ptr
diff --git a/llvm/test/Analysis/CostModel/PowerPC/load-to-trunc.ll b/llvm/test/Analysis/CostModel/PowerPC/load-to-trunc.ll
--- a/llvm/test/Analysis/CostModel/PowerPC/load-to-trunc.ll
+++ b/llvm/test/Analysis/CostModel/PowerPC/load-to-trunc.ll
@@ -7,7 +7,7 @@
 ; Check that cost is 1 for unusual load to register sized load.
 define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc
 ;
@@ -18,7 +18,7 @@
 define i128 @loadUnusualInteger(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualInteger'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %out = load i128, ptr %ptr, align 8
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out
 ;
   %out = load i128, ptr %ptr
diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll
--- a/llvm/test/Analysis/CostModel/RISCV/cast.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll
@@ -1075,7 +1075,7 @@
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_v64i1 = trunc <64 x i8> undef to <64 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1>
+; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8>
@@ -1085,7 +1085,7 @@
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_v128i1 = trunc <128 x i8> undef to <128 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1>
+; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8>
@@ -1095,7 +1095,7 @@
 ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v256i8_v256i1 = trunc <256 x i8> undef to <256 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1>
+; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc <vscale x 1 x i16> undef to <vscale x 1 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i8>
@@ -1227,8 +1227,8 @@
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i64_v64i32 = trunc <64 x i64> undef to <64 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_v64i1 = trunc <64 x i8> undef to <64 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1>
+; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1>
+; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8>
@@ -1237,8 +1237,8 @@
 ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i64_v128i32 = trunc <128 x i64> undef to <128 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_v128i1 = trunc <128 x i8> undef to <128 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1>
+; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1>
+; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8>
@@ -1247,8 +1247,8 @@
 ; RV64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v256i64_v256i32 = trunc <256 x i64> undef to <256 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v256i8_v256i1 = trunc <256 x i8> undef to <256 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1>
+; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1>
+; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc <vscale x 1 x i16> undef to <vscale x 1 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i8>
diff --git a/llvm/test/Analysis/CostModel/RISCV/fca-load-store.ll b/llvm/test/Analysis/CostModel/RISCV/fca-load-store.ll
--- a/llvm/test/Analysis/CostModel/RISCV/fca-load-store.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/fca-load-store.ll
@@ -4,10 +4,10 @@
 define void @load(ptr %p) {
 ; CHECK-LABEL: 'load'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = load [2 x i64], ptr %p, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = load [4 x i64], ptr %p, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %3 = load { i64, i64 }, ptr %p, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = load { i64, i32 }, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = load [2 x i64], ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = load [4 x i64], ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %3 = load { i64, i64 }, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = load { i64, i32 }, ptr %p, align 8
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   load [2 x i64], ptr %p
@@ -20,10 +20,10 @@
 define void @store(ptr %p) {
 ; CHECK-LABEL: 'store'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store [2 x i64] undef, ptr %p, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store [4 x i64] undef, ptr %p, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store { i64, i64 } undef, ptr %p, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store { i64, i32 } undef, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store [2 x i64] undef, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store [4 x i64] undef, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store { i64, i64 } undef, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store { i64, i32 } undef, ptr %p, align 8
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   store [2 x i64] undef, ptr %p
diff --git a/llvm/test/Analysis/CostModel/RISCV/load-to-trunc.ll b/llvm/test/Analysis/CostModel/RISCV/load-to-trunc.ll
--- a/llvm/test/Analysis/CostModel/RISCV/load-to-trunc.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/load-to-trunc.ll
@@ -8,8 +8,8 @@
 ; Check that cost is 1 for unusual load to register sized load.
 define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc = trunc i128 %out to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc
 ;
   %out = load i128, ptr %ptr
@@ -19,7 +19,7 @@
 define i128 @loadUnusualInteger(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualInteger'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %out = load i128, ptr %ptr, align 16
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out
 ;
   %out = load i128, ptr %ptr
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
@@ -44,7 +44,7 @@
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %37 = load <vscale x 8 x i32>, ptr %p, align 32
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %38 = load <vscale x 16 x i32>, ptr %p, align 64
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %39 = load <vscale x 32 x i32>, ptr %p, align 128
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %40 = load i64, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %40 = load i64, ptr %p, align 8
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %41 = load <1 x i64>, ptr %p, align 8
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %42 = load <2 x i64>, ptr %p, align 16
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %43 = load <4 x i64>, ptr %p, align 32
@@ -187,7 +187,7 @@
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <vscale x 8 x i32> undef, ptr %p, align 32
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <vscale x 16 x i32> undef, ptr %p, align 64
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <vscale x 32 x i32> undef, ptr %p, align 128
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 undef, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 undef, ptr %p, align 8
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i64> undef, ptr %p, align 8
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr %p, align 16
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i64> undef, ptr %p, align 32
diff --git a/llvm/test/Analysis/CostModel/SystemZ/load-to-trunc.ll b/llvm/test/Analysis/CostModel/SystemZ/load-to-trunc.ll
--- a/llvm/test/Analysis/CostModel/SystemZ/load-to-trunc.ll
+++ b/llvm/test/Analysis/CostModel/SystemZ/load-to-trunc.ll
@@ -8,7 +8,7 @@
 ; Check that cost is 1 for unusual load to register sized load.
 define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc
 ;
@@ -19,7 +19,7 @@
 define i128 @loadUnusualInteger(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualInteger'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out
 ;
   %out = load i128, ptr %ptr
diff --git a/llvm/test/Analysis/CostModel/X86/load-to-trunc.ll b/llvm/test/Analysis/CostModel/X86/load-to-trunc.ll
--- a/llvm/test/Analysis/CostModel/X86/load-to-trunc.ll
+++ b/llvm/test/Analysis/CostModel/X86/load-to-trunc.ll
@@ -9,7 +9,7 @@
 ; Check that cost is 1 for unusual load to register sized load.
 define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc
 ;
@@ -20,7 +20,7 @@
 define i128 @loadUnusualInteger(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualInteger'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out
 ;
   %out = load i128, ptr %ptr
diff --git a/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll b/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll
--- a/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll
+++ b/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll
@@ -542,20 +542,20 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
@@ -569,8 +569,8 @@
 ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
 ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
 ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
 ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
@@ -587,8 +587,8 @@
 ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
 ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
 ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1>
-; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
-; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
+; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
+; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
 ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512VL512-LABEL: 'trunc_vXi1'
@@ -596,8 +596,8 @@
 ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
 ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
 ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
 ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
@@ -608,14 +608,14 @@
 ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
 ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
 ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
 ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
 ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1>
-; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
-; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
+; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
+; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
 ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SKX256-LABEL: 'trunc_vXi1'
@@ -623,14 +623,14 @@
 ; SKX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
 ; SKX256-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
 ; SKX256-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; SKX256-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; SKX256-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; SKX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; SKX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; SKX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; SKX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
 ; SKX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
 ; SKX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; SKX256-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; SKX256-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; SKX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; SKX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; SKX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; SKX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
 ; SKX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
@@ -650,14 +650,14 @@
 ; SKX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
 ; SKX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
 ; SKX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; SKX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; SKX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; SKX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; SKX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; SKX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; SKX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
 ; SKX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
 ; SKX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; SKX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; SKX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; SKX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; SKX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; SKX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; SKX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
 ; SKX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
diff --git a/llvm/test/Analysis/CostModel/X86/size-cost.ll b/llvm/test/Analysis/CostModel/X86/size-cost.ll
--- a/llvm/test/Analysis/CostModel/X86/size-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/size-cost.ll
@@ -48,7 +48,7 @@
 define ptr @inttoptr_i64_p64(i64 %x) {
 ; CHECK-LABEL: 'inttoptr_i64_p64'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = inttoptr i64 %x to ptr
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = inttoptr i64 %x to ptr
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret ptr %r
 ;
   %r = inttoptr i64 %x to ptr
@@ -57,7 +57,7 @@
 define i64 @ptrtoint_p64_i64(ptr %x) {
 ; CHECK-LABEL: 'ptrtoint_p64_i64'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = ptrtoint ptr %x to i64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = ptrtoint ptr %x to i64
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
 ;
   %r = ptrtoint ptr %x to i64
diff --git a/llvm/test/Analysis/CostModel/X86/trunc-codesize.ll b/llvm/test/Analysis/CostModel/X86/trunc-codesize.ll
--- a/llvm/test/Analysis/CostModel/X86/trunc-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/trunc-codesize.ll
@@ -537,26 +537,26 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
@@ -572,26 +572,26 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
@@ -607,26 +607,26 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
@@ -663,292 +663,6 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX512-LABEL: 'trunc_vXi1'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x 
i32> undef to <224 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i16 = trunc <5 x i16> undef to <5 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i8 = trunc <5 x i8> undef to <5 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i8 = trunc <6 x i8> undef to <6 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i8 = trunc <7 x i8> undef to <7 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i8 = trunc <10 x i8> undef to <10 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i8 = trunc <12 x i8> undef to <12 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i8 = trunc <14 x i8> undef to <14 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1> -; AVX512-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef -; -; AVX256-LABEL: 'trunc_vXi1' -; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1 -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1> -; AVX256-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V4i32 = trunc <4 x i32> undef to <4 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> -; 
AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i16 = trunc <5 x i16> undef to <5 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = 
trunc <320 x i16> undef to <320 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i8 = trunc <5 x i8> undef to <5 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i8 = trunc <6 x i8> undef to <6 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i8 = trunc <7 x i8> undef to <7 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i8 = trunc <10 x i8> undef to <10 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i8 = trunc <12 x i8> undef to <12 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i8 = trunc <14 x i8> undef to <14 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i8 = trunc 
<128 x i8> undef to <128 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %i64 = trunc i64 undef to i1 %V2i64 = trunc <2 x i64> undef to <2 x i1> @@ -1096,3 +810,6 @@ ret i32 undef } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX256: {{.*}} +; AVX512: {{.*}} diff --git a/llvm/test/Analysis/CostModel/X86/trunc-latency.ll b/llvm/test/Analysis/CostModel/X86/trunc-latency.ll --- a/llvm/test/Analysis/CostModel/X86/trunc-latency.ll +++ b/llvm/test/Analysis/CostModel/X86/trunc-latency.ll @@ -537,26 +537,26 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = 
trunc <128 x i64> undef to <128 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> @@ -572,26 +572,26 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc 
<80 x i32> undef to <80 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> @@ -607,26 +607,26 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 
x i16> undef to <64 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> @@ -663,292 +663,6 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef -; -; AVX512-LABEL: 'trunc_vXi1' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to 
<4 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> -; AVX512-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> 
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i16 = trunc <5 x i16> undef to <5 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = 
trunc <128 x i16> undef to <128 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i8 = trunc <5 x i8> undef to <5 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i8 = trunc <6 x i8> undef to <6 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i8 = trunc <7 x i8> undef to <7 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i8 = trunc <10 x i8> undef to <10 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i8 = trunc <12 x i8> undef to <12 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i8 = trunc <14 x i8> undef to <14 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX256-LABEL: 'trunc_vXi1'
-; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i16 = trunc <5 x i16> undef to <5 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i8 = trunc <5 x i8> undef to <5 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i8 = trunc <6 x i8> undef to <6 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i8 = trunc <7 x i8> undef to <7 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i8 = trunc <10 x i8> undef to <10 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i8 = trunc <12 x i8> undef to <12 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i8 = trunc <14 x i8> undef to <14 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 %i64 = trunc i64 undef to i1
 %V2i64 = trunc <2 x i64> undef to <2 x i1>
@@ -1096,3 +810,6 @@
 ret i32 undef
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX256: {{.*}}
+; AVX512: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/X86/trunc-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/trunc-sizelatency.ll
--- a/llvm/test/Analysis/CostModel/X86/trunc-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/trunc-sizelatency.ll
@@ -537,26 +537,26 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
@@ -572,26 +572,26 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
@@ -607,26 +607,26 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
@@ -663,292 +663,6 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX512-LABEL: 'trunc_vXi1'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i16 = trunc <5 x i16> undef to <5 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i8 = trunc <5 x i8> undef to <5 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i8 = trunc <6 x i8> undef to <6 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i8 = trunc <7 x i8> undef to <7 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i8 = trunc <10 x i8> undef to <10 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i8 = trunc <12 x i8> undef to <12 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i8 = trunc <14 x i8> undef to <14 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX256-LABEL: 'trunc_vXi1'
-; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i16 = trunc <5 x i16> undef to <5 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i8 = trunc <5 x i8> undef to <5 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i8 = trunc <6 x i8> undef to <6 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i8 = trunc <7 x i8> undef to <7 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i8 = trunc <10 x i8> undef to <10 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i8 = trunc <12 x i8> undef to <12 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i8 = trunc <14 x i8> undef to <14 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
-; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 %i64 = trunc i64 undef to i1
 %V2i64 = trunc <2 x i64> undef to <2 x i1>
@@ -1096,3 +810,6 @@
 ret i32 undef
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX256: {{.*}}
+; AVX512: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/X86/trunc.ll b/llvm/test/Analysis/CostModel/X86/trunc.ll
--- a/llvm/test/Analysis/CostModel/X86/trunc.ll
+++ b/llvm/test/Analysis/CostModel/X86/trunc.ll
@@ -2754,26 +2754,26 @@
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 231 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 462 for instruction: %V224i64 = trunc <224 x
i64> undef to <224 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 660 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 924 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 368 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1584 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1848 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 736 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> @@ -2789,26 +2789,26 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 231 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> ; AVX1-NEXT: Cost Model: Found an 
estimated cost of 330 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 462 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 660 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 924 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1584 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1848 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> @@ -2824,26 +2824,26 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 231 for instruction: %V112i16 = trunc <112 x 
i16> undef to <112 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 462 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 660 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 924 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1584 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1848 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> @@ -2897,26 +2897,26 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; AVX2-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 231 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 462 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 660 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 924 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 368 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1584 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1848 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 736 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> @@ -2932,26 +2932,26 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V48i32 = trunc 
<48 x i32> undef to <48 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 231 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 462 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 660 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 924 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1584 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1848 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> @@ -2967,26 +2967,26 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; AVX2-NEXT: Cost 
Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 231 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 462 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 660 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 924 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1584 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1848 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> @@ -3040,26 +3040,26 @@ ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 48 for 
instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 704 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> 
undef to <1024 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> @@ -3110,26 +3110,26 @@ ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated 
cost of 1280 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> @@ -3145,26 +3145,26 @@ ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1> ; AVX512FVEC512-NEXT: Cost Model: 
Found an estimated cost of 768 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512FVEC256-LABEL: 'trunc_vXi1' @@ -3183,26 +3183,26 @@ ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 176 for 
instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 704 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> @@ -3288,26 +3288,26 @@ ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> +; AVX512FVEC256-NEXT: Cost 
Model: Found an estimated cost of 0 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQVEC512-LABEL: 'trunc_vXi1' @@ -3396,26 +3396,26 @@ ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 160 for 
instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> @@ -3431,26 +3431,26 @@ ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i8 = trunc <40 x i8> 
undef to <40 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQVEC256-LABEL: 'trunc_vXi1' @@ -3574,26 +3574,26 @@ ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1> -; 
AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> +; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1> -; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> +; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1> -; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> +; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1> -; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> +; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1> -; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> +; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> -; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> +; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BWVEC512-LABEL: 'trunc_vXi1' @@ -3612,26 +3612,26 @@ ; AVX512BWVEC512-NEXT: Cost Model: 
Found an estimated cost of 40 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 
752 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> @@ -3647,26 +3647,26 @@ ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V512i32 = trunc <512 
x i32> undef to <512 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> @@ -3755,26 +3755,26 @@ ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> +; 
AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 752 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> @@ -3790,26 +3790,26 @@ ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> +; AVX512BWVEC256-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 304 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> @@ -3898,26 +3898,26 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; 
BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 231 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 462 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 660 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 924 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 368 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1584 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1848 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 736 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> @@ -3933,26 +3933,26 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x 
i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 231 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 462 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 660 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 924 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1584 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1848 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> @@ -3968,26 +3968,26 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V28i16 = trunc <28 x i16> undef to 
<28 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 231 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 462 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 660 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 924 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1584 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1848 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> diff --git 
a/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll b/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll --- a/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll +++ b/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll @@ -8,8 +8,9 @@ ; CHECK-NEXT: [[TST:%.*]] = load i1, ptr [[ADDR]], align 1 ; CHECK-NEXT: br i1 [[TST]], label [[NEXT:%.*]], label [[END:%.*]] ; CHECK: next: -; CHECK-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[OFFSET]] -; CHECK-NEXT: [[TMP1:%.*]] = load volatile i1, ptr [[SUNKADDR]], align 1 +; CHECK-NEXT: [[SUNKADDR:%.*]] = trunc i64 [[OFFSET]] to i32 +; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr i8, ptr [[BASE]], i32 [[SUNKADDR]] +; CHECK-NEXT: [[TMP1:%.*]] = load volatile i1, ptr [[SUNKADDR1]], align 1 ; CHECK-NEXT: ret void ; CHECK: end: ; CHECK-NEXT: ret void @@ -33,8 +34,9 @@ ; CHECK-NEXT: [[TST:%.*]] = load i1, ptr [[ADDR]], align 1 ; CHECK-NEXT: br i1 [[TST]], label [[NEXT:%.*]], label [[END:%.*]] ; CHECK: next: -; CHECK-NEXT: [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[OFFSET]] -; CHECK-NEXT: [[TMP1:%.*]] = load volatile i1, ptr [[SUNKADDR]], align 1 +; CHECK-NEXT: [[SUNKADDR:%.*]] = trunc i64 [[OFFSET]] to i32 +; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i32 [[SUNKADDR]] +; CHECK-NEXT: [[TMP1:%.*]] = load volatile i1, ptr [[SUNKADDR1]], align 1 ; CHECK-NEXT: ret void ; CHECK: end: ; CHECK-NEXT: ret void @@ -61,8 +63,8 @@ ; CHECK-NEXT: [[TST:%.*]] = load i1, ptr [[ADDR]], align 1 ; CHECK-NEXT: br i1 [[TST]], label [[NEXT:%.*]], label [[END:%.*]] ; CHECK: next: -; CHECK-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[OFFSET]] -; CHECK-NEXT: [[TMP1:%.*]] = load volatile i1, ptr [[SUNKADDR]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[ADDR64]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = load volatile i1, ptr [[TMP1]], align 1 ; CHECK-NEXT: ret void ; CHECK: end: ; CHECK-NEXT: ret void diff --git a/llvm/test/CodeGen/AArch64/sme-aarch64-svcount-O3.ll b/llvm/test/CodeGen/AArch64/sme-aarch64-svcount-O3.ll --- a/llvm/test/CodeGen/AArch64/sme-aarch64-svcount-O3.ll +++ b/llvm/test/CodeGen/AArch64/sme-aarch64-svcount-O3.ll @@ -5,14 +5,14 @@ define target("aarch64.svcount") @test_alloca_store_reload(target("aarch64.svcount") %val0, target("aarch64.svcount") %val1, ptr %iptr, ptr %pptr, i64 %N) nounwind { ; CHECK-LABEL: @test_alloca_store_reload( ; CHECK-NEXT: entry: -; CHECK-NEXT: store i64 0, ptr [[IPTR:%.*]], align 4 +; CHECK-NEXT: store i64 0, ptr [[IPTR:%.*]], align 8 ; CHECK-NEXT: store target("aarch64.svcount") [[VAL0:%.*]], ptr [[PPTR:%.*]], align 2 ; CHECK-NEXT: [[I1_PEEL:%.*]] = icmp eq i64 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[I1_PEEL]], label [[LOOP_EXIT:%.*]], label [[LOOP_BODY:%.*]] ; CHECK: loop.body: ; CHECK-NEXT: [[IND:%.*]] = phi i64 [ [[IND_NEXT:%.*]], [[LOOP_BODY]] ], [ 1, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[IPTR_GEP:%.*]] = getelementptr i64, ptr [[IPTR]], i64 [[IND]] -; CHECK-NEXT: store i64 [[IND]], ptr [[IPTR_GEP]], align 4 +; CHECK-NEXT: store i64 [[IND]], ptr [[IPTR_GEP]], align 8 ; CHECK-NEXT: store target("aarch64.svcount") [[VAL1:%.*]], ptr [[PPTR]], align 2 ; CHECK-NEXT: [[IND_NEXT]] = add i64 [[IND]], 1 ; CHECK-NEXT: [[I1:%.*]] = icmp eq i64 [[IND]], [[N]] diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll @@ -183,12 +183,12 @@ define amdgpu_kernel void 
@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(ptr addrspace(1) %out) #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat ; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 4 +; AKF_HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 8 ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { -; ATTRIBUTOR_HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 4 +; ATTRIBUTOR_HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 8 ; ATTRIBUTOR_HSA-NEXT: ret void ; store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -69,7 +69,6 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -164,7 +163,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -280,7 +278,6 @@ ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -398,7 +395,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -480,7 +476,6 @@ ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c @@ -551,7 +546,6 @@ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c @@ -630,7 +624,6 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -715,7 +708,6 @@ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: 
s_load_dword s4, s[0:1], 0x2c @@ -788,7 +780,6 @@ ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c @@ -856,7 +847,6 @@ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c @@ -934,7 +924,6 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -1020,7 +1009,6 @@ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -1283,7 +1271,6 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 @@ -1591,7 +1578,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 @@ -1983,7 +1969,6 @@ ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s0, v3 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 @@ -2391,7 +2376,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 @@ -2657,7 +2641,6 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -2879,7 +2862,6 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -3129,7 +3111,6 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -3411,7 +3392,6 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -3544,7 +3524,6 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -3618,7 +3597,6 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c @@ -3700,7 +3678,6 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -3788,7 +3765,6 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX6-NEXT: buffer_store_byte v0, 
off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -3934,7 +3910,6 @@ ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -4107,7 +4082,6 @@ ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -4302,7 +4276,6 @@ ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -4519,7 +4492,6 @@ ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -4713,7 +4685,6 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v3i15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 @@ -4908,7 +4879,6 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_v3i15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 @@ -5123,7 +5093,6 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_v3i15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 @@ -5360,7 +5329,6 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_v3i15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 @@ -5464,7 +5432,6 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i32_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -5501,7 +5468,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -5537,7 +5503,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -5579,7 +5544,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -5625,7 +5589,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -5771,7 +5734,6 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -5854,7 +5816,6 @@ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; 
GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i32_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -5893,7 +5854,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -5930,7 +5890,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -5973,7 +5932,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -6105,7 +6063,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -6181,7 +6138,6 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i32_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -6221,7 +6177,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -6289,7 +6244,6 @@ ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -6365,7 +6319,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -6420,7 +6373,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -6605,7 +6557,6 @@ ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -6707,7 +6658,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i32_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -6750,7 +6700,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -6813,7 +6762,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -6888,7 +6836,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -7060,7 +7007,6 @@ ; GFX6-NEXT: 
v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -7134,7 +7080,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; CHECK-LABEL: @udiv_i64_oddk_denom( ; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943 -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: udiv_i64_oddk_denom: @@ -7257,7 +7203,6 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i64_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f176a73 @@ -7397,7 +7342,7 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; CHECK-LABEL: @udiv_i64_pow2k_denom( ; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 4096 -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: udiv_i64_pow2k_denom: @@ -7413,7 +7358,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -7433,7 +7377,7 @@ ; CHECK-LABEL: @udiv_i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: udiv_i64_pow2_shl_denom: @@ -7451,7 +7395,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -7496,7 +7439,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 @@ -7634,7 +7576,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0x457ff000 @@ -7793,7 +7734,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 @@ -7819,7 +7759,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; CHECK-LABEL: @urem_i64_oddk_denom( ; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993 -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: urem_i64_oddk_denom: @@ -7941,7 +7881,6 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i64_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 @@ -8079,7 +8018,7 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr 
addrspace(1) %out, i64 %x) { ; CHECK-LABEL: @urem_i64_pow2k_denom( ; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 4096 -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: urem_i64_pow2k_denom: @@ -8095,7 +8034,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -8114,7 +8052,7 @@ ; CHECK-LABEL: @urem_i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: urem_i64_pow2_shl_denom: @@ -8135,7 +8073,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -8183,7 +8120,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 @@ -8238,7 +8174,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 @@ -8269,7 +8204,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; CHECK-LABEL: @sdiv_i64_oddk_denom( ; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195 -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: sdiv_i64_oddk_denom: @@ -8387,7 +8322,6 @@ ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i64_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4996c7d8 @@ -8524,7 +8458,7 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; CHECK-LABEL: @sdiv_i64_pow2k_denom( ; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 4096 -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: sdiv_i64_pow2k_denom: @@ -8544,7 +8478,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -8568,7 +8501,7 @@ ; CHECK-LABEL: @sdiv_i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: sdiv_i64_pow2_shl_denom: @@ -8707,7 +8640,6 @@ ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, 
s[0:1], 0x34 @@ -8899,7 +8831,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 @@ -9060,7 +8991,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 0x457ff000 @@ -9473,7 +9403,6 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 @@ -9775,7 +9704,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; CHECK-LABEL: @srem_i64_oddk_denom( ; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 1235195 -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: srem_i64_oddk_denom: @@ -9891,7 +9820,6 @@ ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i64_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4996c7d8 @@ -10024,7 +9952,7 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; CHECK-LABEL: @srem_i64_pow2k_denom( ; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 4096 -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: srem_i64_pow2k_denom: @@ -10046,7 +9974,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -10072,7 +9999,7 @@ ; CHECK-LABEL: @srem_i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: srem_i64_pow2_shl_denom: @@ -10209,7 +10136,6 @@ ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 @@ -10402,7 +10328,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 @@ -10701,7 +10626,6 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll @@ -56,12 +56,13 @@ ; CHECK-LABEL: define void @sincos_f32 ; CHECK-SAME: 
(float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[__SINCOS_]]) -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[__SINCOS_]], align 4 -; CHECK-NEXT: store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4 +; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 +; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4 ; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]]) -; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4 +; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -166,13 +166,13 @@ ; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_id ; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: [[VAL:%.*]] = call i64 @llvm.amdgcn.dispatch.id() -; AKF_HSA-NEXT: store volatile i64 [[VAL]], ptr addrspace(1) undef, align 4 +; AKF_HSA-NEXT: store volatile i64 [[VAL]], ptr addrspace(1) undef, align 8 ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_id ; ATTRIBUTOR_HSA-SAME: () #[[ATTR9:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i64 @llvm.amdgcn.dispatch.id() -; ATTRIBUTOR_HSA-NEXT: store volatile i64 [[VAL]], ptr addrspace(1) undef, align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i64 [[VAL]], ptr addrspace(1) undef, align 8 ; ATTRIBUTOR_HSA-NEXT: ret void ; %val = call i64 @llvm.amdgcn.dispatch.id() diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll --- a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll +++ b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll @@ -42,7 +42,7 @@ ; CHECK-NEXT: [[VAL8:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: store volatile i32 [[VAL8]], ptr addrspace(1) null, align 4 ; CHECK-NEXT: [[VAL9:%.*]] = call i64 @llvm.amdgcn.dispatch.id() -; CHECK-NEXT: store volatile i64 [[VAL9]], ptr addrspace(1) null, align 4 +; CHECK-NEXT: store volatile i64 [[VAL9]], ptr addrspace(1) null, align 8 ; CHECK-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/lds-reject-absolute-addresses.ll b/llvm/test/CodeGen/AMDGPU/lds-reject-absolute-addresses.ll --- a/llvm/test/CodeGen/AMDGPU/lds-reject-absolute-addresses.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-reject-absolute-addresses.ll @@ -11,5 +11,5 @@ ret void } -!0 = !{i64 0, i64 1} +!0 = !{i32 0, i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll --- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll @@ -34,7 +34,7 @@ ; CHECK: @[[__INIT_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external 
addrspace(1) constant [0 x ptr addrspace(1)] ; CHECK: @[[__FINI_ARRAY_START:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)] ; CHECK: @[[__FINI_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)] -; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata" +; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata" ; CHECK: @[[FOO_ALIAS:[a-zA-Z0-9_$"\\.-]+]] = hidden alias void (), ptr @foo ;. ; CHECK-LABEL: define void @foo( diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll --- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll @@ -51,7 +51,7 @@ ; CHECK: @[[__INIT_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)] ; CHECK: @[[__FINI_ARRAY_START:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)] ; CHECK: @[[__FINI_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)] -; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata" +; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata" ;. ; CHECK-LABEL: define internal void @foo() { ; CHECK-NEXT: ret void diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll --- a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll @@ -15,7 +15,7 @@ ;. 
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 8, !absolute_symbol !0 -; CHECK: @llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" +; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" ; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t poison, align 16, !absolute_symbol !0 ; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t poison, align 16, !absolute_symbol !0 ; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t poison, align 2, !absolute_symbol !0 @@ -99,4 +99,4 @@ ; CHECK: attributes #3 = { "amdgpu-lds-size"="4" } ; CHECK: attributes #4 = { "amdgpu-lds-size"="9" } -; CHECK: !0 = !{i64 0, i64 1} +; CHECK: !0 = !{i32 0, i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll --- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll @@ -24,7 +24,7 @@ ; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t poison, align 16 ; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t poison, align 8 -; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t poison, align 16 +; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t poison, align 8 ; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t poison, align 4 ; CHECK-LABEL: @k1 @@ -133,11 +133,9 @@ ; Check that alignment is not propagated if use is not a pointer operand.
; CHECK-LABEL: @k4 -; SUPER-ALIGN_ON: store i32 poison, ptr addrspace(3) %gep, align 8 -; SUPER-ALIGN_OFF: store i32 poison, ptr addrspace(3) %gep, align 4 +; CHECK: store i32 poison, ptr addrspace(3) %gep, align 4 ; CHECK: store ptr addrspace(3) %gep, ptr poison, align 4 -; SUPER-ALIGN_ON: %val1 = cmpxchg volatile ptr addrspace(3) %gep, i32 1, i32 2 monotonic monotonic, align 8 -; SUPER-ALIGN_OFF: %val1 = cmpxchg volatile ptr addrspace(3) %gep, i32 1, i32 2 monotonic monotonic, align 4 +; CHECK: %val1 = cmpxchg volatile ptr addrspace(3) %gep, i32 1, i32 2 monotonic monotonic, align 4 ; CHECK: %val2 = cmpxchg volatile ptr poison, ptr addrspace(3) %gep, ptr addrspace(3) poison monotonic monotonic, align 4 define amdgpu_kernel void @k4() { %gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @lds.6, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll --- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll @@ -62,4 +62,4 @@ ret void } -; CHECK: !0 = !{i64 0, i64 1} +; CHECK: !0 = !{i32 0, i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll @@ -69,7 +69,7 @@ ; CHECK: attributes #[[ATTR0]] = { "amdgpu-lds-size"="7" } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ;. -; CHECK: [[META0:![0-9]+]] = !{i64 0, i64 1} +; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 1} ; CHECK: [[META1:![0-9]+]] = !{!2} ; CHECK: [[META2:![0-9]+]] = distinct !{!2, !3} ; CHECK: [[META3:![0-9]+]] = distinct !{!3} diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll @@ -42,7 +42,7 @@ !8 = !{!"omnipotent char", !9, i64 0} !9 = !{!"Simple C++ TBAA"} -; CHECK:!0 = !{i64 0, i64 1} +; CHECK:!0 = !{i32 0, i32 1} ; CHECK:!1 = !{!2, !3, i64 0} ; CHECK:!2 = !{!"no_clobber_ds_load_stores_x2_preexisting_aa", !3, i64 0} ; CHECK:!3 = !{!"int", !4, i64 0} diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll @@ -110,7 +110,7 @@ ret void } -; CHECK: !0 = !{i64 0, i64 1} +; CHECK: !0 = !{i32 0, i32 1} ; CHECK: !1 = !{!2} ; CHECK: !2 = distinct !{!2, !3} ; CHECK: !3 = distinct !{!3} diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll @@ -16,7 +16,7 @@ ; CHECK: @dynamic_kernel_only = external addrspace(3) global [0 x double] ; CHECK: @dynamic_shared8 = external addrspace(3) global [0 x i64], align 8 ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol !0 -; CHECK: @llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" +; CHECK: 
@llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" ; Alignment of these must be the maximum of the alignment of the reachable symbols ; CHECK: @llvm.amdgcn.expect_align1.dynlds = external addrspace(3) global [0 x i8], align 1, !absolute_symbol !0 @@ -103,7 +103,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED8]], align 4 ; CHECK-NEXT: [[DYNAMIC_SHARED81:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], ptr addrspace(3) [[DYNAMIC_SHARED81]], i32 0, i32 7 -; CHECK-NEXT: store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 4 +; CHECK-NEXT: store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 8 ; CHECK-NEXT: ret void ; %arrayidx = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 7 @@ -149,7 +149,7 @@ ; CHECK-LABEL: define amdgpu_kernel void @expect_align8() !llvm.amdgcn.lds.kernel.id !5 { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align8.dynlds) ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 9 -; CHECK-NEXT: store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 4 +; CHECK-NEXT: store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 8 ; CHECK-NEXT: call void @use_shared8() ; CHECK-NEXT: ret void ; @@ -188,8 +188,8 @@ ; CHECK: attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) } ; CHECK: attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; CHECK: !0 = !{i64 0, i64 1} -; CHECK: !1 = !{i64 4, i64 5} +; CHECK: !0 = !{i32 0, i32 1} +; CHECK: !1 = !{i32 4, i32 5} ; CHECK: !2 = !{i32 0} ; CHECK: !3 = !{i32 1} ; CHECK: !4 = !{i32 2} diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll @@ -26,7 +26,7 @@ @llvm.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(3) @tolower to ptr), ptr addrspacecast (ptr addrspace(1) @ignored to ptr)], section "llvm.metadata" ; @ignored still in list, @tolower removed, llvm.amdgcn.module.lds appended -; CHECK: @llvm.compiler.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(1) @ignored to ptr), ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" +; CHECK: @llvm.compiler.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(1) @ignored to ptr), ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" @llvm.compiler.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(3) @tolower to ptr), ptr addrspacecast (ptr addrspace(1) @ignored to ptr)], section "llvm.metadata" diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -mtriple=amdgcn--amdhsa -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=hybrid | FileCheck -check-prefix=OPT %s ; RUN: 
llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=hybrid | FileCheck -check-prefix=GCN %s @@ -12,7 +12,7 @@ @unused = addrspace(3) global i16 poison ; OPT: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 16, !absolute_symbol !0 -; OPT: @llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" +; OPT: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" ; OPT: @llvm.amdgcn.kernel.kernel_no_table.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kernel_no_table.lds.t poison, align 8, !absolute_symbol !0 ; OPT: @llvm.amdgcn.kernel.k01.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k01.lds.t poison, align 4, !absolute_symbol !1 ; OPT: @llvm.amdgcn.kernel.k23.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k23.lds.t poison, align 8, !absolute_symbol !0 @@ -73,12 +73,12 @@ ; OPT-NEXT: [[V22:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 ; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V22]], align 4 ; OPT-NEXT: [[V23:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) -; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 4 +; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 8 ; OPT-NEXT: [[MUL:%.*]] = mul i64 [[LD]], 4 ; OPT-NEXT: [[V2:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 ; OPT-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[V2]], align 4 ; OPT-NEXT: [[V21:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3) -; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) [[V21]], align 4 +; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) [[V21]], align 8 ; OPT-NEXT: ret void ; ; GCN-LABEL: f2: @@ -193,7 +193,7 @@ define amdgpu_kernel void @k23() { ; OPT-LABEL: @k23( -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ], !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]] +; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ], !alias.scope !4, !noalias !7 ; OPT-NEXT: call void @f2() ; OPT-NEXT: call void @f3() ; OPT-NEXT: ret void @@ -231,12 +231,12 @@ ; Access and allocate three variables define amdgpu_kernel void @k123() { ; OPT-LABEL: @k123( -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope [[META10:![0-9]+]], !noalias [[META13:![0-9]+]] +; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope !10, !noalias !13 ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; OPT-NEXT: call void @f1() -; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META13]], !noalias [[META10]] +; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope !13, !noalias !10 ; OPT-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 8 -; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds 
([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META13]], !noalias [[META10]] +; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope !13, !noalias !10 ; OPT-NEXT: call void @f2() ; OPT-NEXT: ret void ; @@ -285,14 +285,14 @@ ; OPT: attributes #0 = { "amdgpu-lds-size"="8" } -; OPT: attributes #1 = { "amdgpu-lds-size"="12" } -; OPT: attributes #2 = { "amdgpu-lds-size"="20" } +; OPT: attributes #1 = { "amdgpu-lds-size"="16" } +; OPT: attributes #2 = { "amdgpu-lds-size"="24" } ; OPT: attributes #3 = { nocallback nofree nosync nounwind willreturn memory(none) } ; OPT: attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; OPT: !0 = !{i64 0, i64 1} -; OPT: !1 = !{i64 4, i64 5} -; OPT: !2 = !{i64 8, i64 9} +; OPT: !0 = !{i32 0, i32 1} +; OPT: !1 = !{i32 4, i32 5} +; OPT: !2 = !{i32 8, i32 9} ; OPT: !3 = !{i32 1} ; OPT: !4 = !{!5} ; OPT: !5 = distinct !{!5, !6} diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -118,12 +118,12 @@ ; OPT-NEXT: [[V22:%.*]] = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 2 ; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V22]], align 4 ; OPT-NEXT: [[V23:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) -; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 4 +; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 8 ; OPT-NEXT: [[MUL:%.*]] = mul i64 [[LD]], 4 ; OPT-NEXT: [[V2:%.*]] = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 2 ; OPT-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[V2]], align 4 ; OPT-NEXT: [[V21:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3) -; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) [[V21]], align 4 +; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) [[V21]], align 8 ; OPT-NEXT: ret void ; ; GCN-LABEL: f2: @@ -300,7 +300,7 @@ ; Access and allocate three variables define amdgpu_kernel void @k123() { ; OPT-LABEL: define amdgpu_kernel void @k123( -; OPT-SAME: ) #[[ATTR2:[0-9]+]] !llvm.amdgcn.lds.kernel.id !13 { +; OPT-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id !13 { ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope !14, !noalias !17 ; OPT-NEXT: call void @f1() ; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope !20, !noalias !21 @@ -352,8 +352,7 @@ ; OPT: declare i32 @llvm.amdgcn.lds.kernel.id() ; OPT: attributes #0 = { "amdgpu-lds-size"="8" } -; OPT: attributes #1 = { "amdgpu-lds-size"="12" } -; OPT: attributes #2 = { "amdgpu-lds-size"="16" } +; OPT: attributes #1 = { "amdgpu-lds-size"="16" } !0 = !{i64 0, i64 1} !1 = !{i32 0} diff --git a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll --- a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll @@ -11,7 +11,7 @@ ; CHECK: @__init_array_end = external 
addrspace(1) constant [0 x ptr addrspace(1)] ; CHECK: @__fini_array_start = external addrspace(1) constant [0 x ptr addrspace(1)] ; CHECK: @__fini_array_end = external addrspace(1) constant [0 x ptr addrspace(1)] -; CHECK: @llvm.used = appending global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini] +; CHECK: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini] ; UTC_ARGS: --enable diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll --- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -50,7 +50,7 @@ ; IR-NEXT: bb: ; IR-NEXT: [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #[[ATTR4:[0-9]+]] ; IR-NEXT: [[MY_TMP1:%.*]] = getelementptr inbounds i64, ptr addrspace(3) [[ARG:%.*]], i32 [[MY_TMP]] -; IR-NEXT: [[MY_TMP2:%.*]] = load volatile i64, ptr addrspace(3) [[MY_TMP1]], align 4 +; IR-NEXT: [[MY_TMP2:%.*]] = load volatile i64, ptr addrspace(3) [[MY_TMP1]], align 8 ; IR-NEXT: br label [[BB5:%.*]] ; IR: bb3: ; IR-NEXT: br i1 true, label [[BB4:%.*]], label [[BB13:%.*]] @@ -93,6 +93,7 @@ ; IR: bb23: ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]]) ; IR-NEXT: ret void +; bb: %my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %my.tmp1 = getelementptr inbounds i64, ptr addrspace(3) %arg, i32 %my.tmp @@ -276,6 +277,7 @@ ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]]) ; IR-NEXT: store volatile i32 0, ptr addrspace(1) undef, align 4 ; IR-NEXT: ret void +; bb: %my.tmp1134 = load volatile i32, ptr addrspace(1) undef %my.tmp1235 = icmp slt i32 %my.tmp1134, 9 diff --git a/llvm/test/CodeGen/AMDGPU/opencl-printf.ll b/llvm/test/CodeGen/AMDGPU/opencl-printf.ll --- a/llvm/test/CodeGen/AMDGPU/opencl-printf.ll +++ b/llvm/test/CodeGen/AMDGPU/opencl-printf.ll @@ -86,7 +86,7 @@ ; GCN-NEXT: [[PRINTBUFFNEXTPTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], i32 4 ; GCN-NEXT: store i32 [[I32:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], i32 4 -; GCN-NEXT: store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 4 +; GCN-NEXT: store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 8 ; GCN-NEXT: [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 8 ; GCN-NEXT: store <2 x float> , ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 8 ; GCN-NEXT: [[PRINTBUFFNEXTPTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], i32 8 @@ -108,7 +108,7 @@ ; R600-NEXT: ret void ; ; GCN-LABEL: @format_str_ptr( -; GCN-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 44) +; GCN-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 36) ; GCN-NEXT: br label [[DOTSPLIT:%.*]] ; GCN: .split: ; GCN-NEXT: [[TMP1:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null @@ -120,12 +120,12 @@ ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store ptr [[PTR_FLAT:%.*]], ptr addrspace(1) [[PRINTBUFFGEP]], align 8 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 8 -; GCN-NEXT: store ptr addrspace(3) [[PTR_LDS:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8 -; GCN-NEXT: [[PRINTBUFFNEXTPTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8 +; GCN-NEXT: store 
ptr addrspace(3) [[PTR_LDS:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 4 +; GCN-NEXT: [[PRINTBUFFNEXTPTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 4 ; GCN-NEXT: store ptr addrspace(1) [[PTR_GLOBAL:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], align 8 ; GCN-NEXT: [[PRINTBUFFNEXTPTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], i32 8 -; GCN-NEXT: store ptr addrspace(5) [[PTR_STACK:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], align 8 -; GCN-NEXT: [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], i32 8 +; GCN-NEXT: store ptr addrspace(5) [[PTR_STACK:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], align 4 +; GCN-NEXT: [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], i32 4 ; GCN-NEXT: store ptr addrspace(4) [[PTR_CONST:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], align 8 ; GCN-NEXT: br label [[TMP3]] ; GCN: 3: @@ -145,7 +145,7 @@ ; GCN-NEXT: [[TMP2:%.*]] = sext i4 [[I4:%.*]] to i32 ; GCN-NEXT: [[TMP3:%.*]] = sext i8 [[I8:%.*]] to i32 ; GCN-NEXT: [[TMP4:%.*]] = sext i16 [[I16:%.*]] to i32 -; GCN-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 68) +; GCN-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 72) ; GCN-NEXT: br label [[DOTSPLIT:%.*]] ; GCN: .split: ; GCN-NEXT: [[TMP5:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null @@ -167,11 +167,11 @@ ; GCN-NEXT: [[PRINTBUFFNEXTPTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], i32 4 ; GCN-NEXT: store i32 [[I32:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], i32 4 -; GCN-NEXT: store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 4 +; GCN-NEXT: store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 8 ; GCN-NEXT: [[PRINTBUFFNEXTPTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], i32 8 -; GCN-NEXT: store i96 [[I96:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 4 -; GCN-NEXT: [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 12 -; GCN-NEXT: store i128 [[I128:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 4 +; GCN-NEXT: store i96 [[I96:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 16 +; GCN-NEXT: store i128 [[I128:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 8 ; GCN-NEXT: [[PRINTBUFFNEXTPTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], i32 16 ; GCN-NEXT: store i32 1234, ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], align 4 ; GCN-NEXT: br label [[TMP7]] @@ -192,7 +192,7 @@ ; GCN-NEXT: [[TMP2:%.*]] = zext i4 [[I4:%.*]] to i32 ; GCN-NEXT: [[TMP3:%.*]] = zext i8 [[I8:%.*]] to i32 ; GCN-NEXT: [[TMP4:%.*]] = zext i16 [[I16:%.*]] to i32 -; GCN-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 68) +; GCN-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 72) ; GCN-NEXT: br label [[DOTSPLIT:%.*]] ; GCN: .split: ; GCN-NEXT: [[TMP5:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null @@ -214,11 +214,11 @@ ; GCN-NEXT: [[PRINTBUFFNEXTPTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], i32 4 ; GCN-NEXT: store i32 [[I32:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR5:%.*]] = getelementptr i8, ptr addrspace(1) 
[[PRINTBUFFNEXTPTR4]], i32 4 -; GCN-NEXT: store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 4 +; GCN-NEXT: store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 8 ; GCN-NEXT: [[PRINTBUFFNEXTPTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], i32 8 -; GCN-NEXT: store i96 [[I96:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 4 -; GCN-NEXT: [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 12 -; GCN-NEXT: store i128 [[I128:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 4 +; GCN-NEXT: store i96 [[I96:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 16 +; GCN-NEXT: store i128 [[I128:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 8 ; GCN-NEXT: [[PRINTBUFFNEXTPTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], i32 16 ; GCN-NEXT: store i32 1234, ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], align 4 ; GCN-NEXT: br label [[TMP7]] diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll --- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -134,10 +134,10 @@ define i64 @cmpxchg_private_i64(ptr addrspace(5) %ptr) { ; IR-LABEL: @cmpxchg_private_i64( -; IR-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(5) [[PTR:%.*]], align 4 +; IR-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(5) [[PTR:%.*]], align 8 ; IR-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0 ; IR-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 1, i64 [[TMP1]] -; IR-NEXT: store i64 [[TMP3]], ptr addrspace(5) [[PTR]], align 4 +; IR-NEXT: store i64 [[TMP3]], ptr addrspace(5) [[PTR]], align 8 ; IR-NEXT: [[TMP4:%.*]] = insertvalue { i64, i1 } poison, i64 [[TMP1]], 0 ; IR-NEXT: [[TMP5:%.*]] = insertvalue { i64, i1 } [[TMP4]], i1 [[TMP2]], 1 ; IR-NEXT: [[RESULT_0:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll @@ -93,7 +93,7 @@ ; CHECK-NEXT: [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> undef, float [[FOO3_FCA_0_EXTRACT]], i32 0 ; CHECK-NEXT: [[FOO3_FCA_1_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO1]] ; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 ; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll @@ -7,7 +7,7 @@ ; CHECK-LABEL: @memset_all_zero( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <6 x i64> zeroinitializer, i64 [[VAL:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <6 x i64> [[TMP0]], 
i64 [[VAL]], i32 1 ; CHECK-NEXT: ret void ; entry: @@ -24,7 +24,7 @@ ; CHECK-LABEL: @memset_all_5( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> , i64 [[VAL:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i32 1 ; CHECK-NEXT: ret void ; entry: @@ -42,7 +42,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5) ; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 32, i1 true) -; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4 +; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -57,7 +57,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5) ; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 31, i1 true) -; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4 +; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -73,7 +73,7 @@ ; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5) ; CHECK-NEXT: [[GEP:%.*]] = getelementptr [4 x i64], ptr addrspace(5) [[STACK]], i64 0, i64 1 ; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[GEP]], i8 0, i64 24, i1 true) -; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4 +; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll --- a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll @@ -483,75 +483,41 @@ ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_1_use.body -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[VOID_ONE_OUT_ARG_I32_1_USE:%.*]] zeroinitializer -; -; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_1_use -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_1_USE:%.*]] @void_one_out_arg_i32_1_use.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 0 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store i32 0, ptr [[VAL]], align 4 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_1_use_align.body -; CHECK-SAME: (ptr align 8 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[VOID_ONE_OUT_ARG_I32_1_USE_ALIGN:%.*]] zeroinitializer -; -; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_1_use_align -; CHECK-SAME: (ptr align 8 [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_1_USE_ALIGN:%.*]] @void_one_out_arg_i32_1_use_align.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_I32_1_USE_ALIGN]] [[TMP2]], 0 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 8 +; CHECK-SAME: (ptr align 8 [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store i32 0, ptr [[VAL]], align 8 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_use.body +; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_use ; CHECK-SAME: (i1 [[ARG0:%.*]], ptr [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: br i1 [[ARG0]], label [[RET0:%.*]], label 
[[RET1:%.*]] ; CHECK: ret0: -; CHECK-NEXT: ret [[VOID_ONE_OUT_ARG_I32_2_USE:%.*]] zeroinitializer +; CHECK-NEXT: store i32 0, ptr [[VAL]], align 4 +; CHECK-NEXT: ret void ; CHECK: ret1: -; CHECK-NEXT: ret [[VOID_ONE_OUT_ARG_I32_2_USE]] { i32 9 } -; -; -; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_use -; CHECK-SAME: (i1 [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[VOID_ONE_OUT_ARG_I32_2_USE:%.*]] @void_one_out_arg_i32_2_use.body(i1 [[TMP0]], ptr poison) -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_I32_2_USE]] [[TMP3]], 0 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP1]], align 4 +; CHECK-NEXT: store i32 9, ptr [[VAL]], align 4 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_stores.body +; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_stores ; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: store i32 0, ptr [[VAL]], align 4 -; CHECK-NEXT: ret [[VOID_ONE_OUT_ARG_I32_2_STORES:%.*]] { i32 1 } -; -; -; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_stores -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_2_STORES:%.*]] @void_one_out_arg_i32_2_stores.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_I32_2_STORES]] [[TMP2]], 0 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 1, ptr [[VAL]], align 4 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_stores_clobber.body +; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_stores_clobber ; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: store i32 0, ptr [[VAL]], align 4 ; CHECK-NEXT: call void @may.clobber() -; CHECK-NEXT: ret [[VOID_ONE_OUT_ARG_I32_2_STORES_CLOBBER:%.*]] { i32 1 } -; -; -; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_stores_clobber -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_2_STORES_CLOBBER:%.*]] @void_one_out_arg_i32_2_stores_clobber.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_I32_2_STORES_CLOBBER]] [[TMP2]], 0 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 1, ptr [[VAL]], align 4 ; CHECK-NEXT: ret void ; ; @@ -562,17 +528,10 @@ ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_pre_call_may_clobber.body +; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_pre_call_may_clobber ; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: call void @may.clobber() -; CHECK-NEXT: ret [[VOID_ONE_OUT_ARG_I32_PRE_CALL_MAY_CLOBBER:%.*]] zeroinitializer -; -; -; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_pre_call_may_clobber -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_PRE_CALL_MAY_CLOBBER:%.*]] @void_one_out_arg_i32_pre_call_may_clobber.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_I32_PRE_CALL_MAY_CLOBBER]] [[TMP2]], 0 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 0, ptr [[VAL]], align 4 ; CHECK-NEXT: ret void ; ; @@ -602,101 +561,48 @@ ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_v2i32_1_use.body -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[VOID_ONE_OUT_ARG_V2I32_1_USE:%.*]] { <2 x i32> } -; -; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_v2i32_1_use -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; 
CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_V2I32_1_USE:%.*]] @void_one_out_arg_v2i32_1_use.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_V2I32_1_USE]] [[TMP2]], 0 -; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[TMP0]], align 8 +; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store <2 x i32> , ptr [[VAL]], align 8 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_struct_1_use.body -; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[VOID_ONE_OUT_ARG_STRUCT_1_USE:%.*]] { [[STRUCT:%.*]] { i32 9, i8 99, float 4.000000e+00 } } -; -; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_struct_1_use -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_STRUCT_1_USE:%.*]] @void_one_out_arg_struct_1_use.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_STRUCT_1_USE]] [[TMP2]], 0 -; CHECK-NEXT: store [[STRUCT:%.*]] [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store [[STRUCT:%.*]] { i32 9, i8 99, float 4.000000e+00 }, ptr [[OUT]], align 4 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@i32_one_out_arg_i32_1_use.body -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[I32_ONE_OUT_ARG_I32_1_USE:%.*]] { i32 9, i32 24 } -; -; ; CHECK-LABEL: define {{[^@]+}}@i32_one_out_arg_i32_1_use -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[I32_ONE_OUT_ARG_I32_1_USE:%.*]] @i32_one_out_arg_i32_1_use.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[I32_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 1 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[I32_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 0 -; CHECK-NEXT: ret i32 [[TMP4]] -; -; -; CHECK-LABEL: define {{[^@]+}}@unused_different_type.body -; CHECK-SAME: (ptr [[ARG0:%.*]], ptr nocapture [[ARG1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[UNUSED_DIFFERENT_TYPE:%.*]] { float 4.000000e+00 } +; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store i32 24, ptr [[VAL]], align 4 +; CHECK-NEXT: ret i32 9 ; ; ; CHECK-LABEL: define {{[^@]+}}@unused_different_type -; CHECK-SAME: (ptr [[TMP0:%.*]], ptr nocapture [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[UNUSED_DIFFERENT_TYPE:%.*]] @unused_different_type.body(ptr [[TMP0]], ptr poison) -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[UNUSED_DIFFERENT_TYPE]] [[TMP3]], 0 -; CHECK-NEXT: store float [[TMP4]], ptr [[TMP1]], align 4 +; CHECK-SAME: (ptr [[ARG0:%.*]], ptr nocapture [[ARG1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store float 4.000000e+00, ptr [[ARG1]], align 4 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@multiple_same_return_noalias.body -; CHECK-SAME: (ptr noalias [[OUT0:%.*]], ptr noalias [[OUT1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[MULTIPLE_SAME_RETURN_NOALIAS:%.*]] { i32 1, i32 2 } -; -; ; CHECK-LABEL: define {{[^@]+}}@multiple_same_return_noalias -; CHECK-SAME: (ptr noalias [[TMP0:%.*]], ptr noalias [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[MULTIPLE_SAME_RETURN_NOALIAS:%.*]] @multiple_same_return_noalias.body(ptr poison, ptr poison) -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[MULTIPLE_SAME_RETURN_NOALIAS]] [[TMP3]], 0 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[MULTIPLE_SAME_RETURN_NOALIAS]] [[TMP3]], 1 -; CHECK-NEXT: store i32 [[TMP5]], ptr [[TMP1]], align 4 +; CHECK-SAME: (ptr noalias 
[[OUT0:%.*]], ptr noalias [[OUT1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store i32 1, ptr [[OUT0]], align 4 +; CHECK-NEXT: store i32 2, ptr [[OUT1]], align 4 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@multiple_same_return_mayalias.body -; CHECK-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[MULTIPLE_SAME_RETURN_MAYALIAS:%.*]] { i32 2, i32 1 } -; -; ; CHECK-LABEL: define {{[^@]+}}@multiple_same_return_mayalias -; CHECK-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[MULTIPLE_SAME_RETURN_MAYALIAS:%.*]] @multiple_same_return_mayalias.body(ptr poison, ptr poison) -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[MULTIPLE_SAME_RETURN_MAYALIAS]] [[TMP3]], 0 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[MULTIPLE_SAME_RETURN_MAYALIAS]] [[TMP3]], 1 -; CHECK-NEXT: store i32 [[TMP5]], ptr [[TMP1]], align 4 -; CHECK-NEXT: ret void -; -; -; CHECK-LABEL: define {{[^@]+}}@multiple_same_return_mayalias_order.body ; CHECK-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[MULTIPLE_SAME_RETURN_MAYALIAS_ORDER:%.*]] { i32 1, i32 2 } +; CHECK-NEXT: store i32 1, ptr [[OUT0]], align 4 +; CHECK-NEXT: store i32 2, ptr [[OUT1]], align 4 +; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@multiple_same_return_mayalias_order -; CHECK-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[MULTIPLE_SAME_RETURN_MAYALIAS_ORDER:%.*]] @multiple_same_return_mayalias_order.body(ptr poison, ptr poison) -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[MULTIPLE_SAME_RETURN_MAYALIAS_ORDER]] [[TMP3]], 0 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[MULTIPLE_SAME_RETURN_MAYALIAS_ORDER]] [[TMP3]], 1 -; CHECK-NEXT: store i32 [[TMP5]], ptr [[TMP1]], align 4 +; CHECK-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store i32 2, ptr [[OUT1]], align 4 +; CHECK-NEXT: store i32 1, ptr [[OUT0]], align 4 ; CHECK-NEXT: ret void ; ; @@ -714,60 +620,28 @@ ; CHECK-NEXT: ret i32 [[PHI]] ; ; -; CHECK-LABEL: define {{[^@]+}}@i1_one_out_arg_i32_1_use.body -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[I1_ONE_OUT_ARG_I32_1_USE:%.*]] { i1 true, i32 24 } -; -; ; CHECK-LABEL: define {{[^@]+}}@i1_one_out_arg_i32_1_use -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[I1_ONE_OUT_ARG_I32_1_USE:%.*]] @i1_one_out_arg_i32_1_use.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[I1_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 1 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[I1_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 0 -; CHECK-NEXT: ret i1 [[TMP4]] -; -; -; CHECK-LABEL: define {{[^@]+}}@i1_zeroext_one_out_arg_i32_1_use.body ; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[I1_ZEROEXT_ONE_OUT_ARG_I32_1_USE:%.*]] { i1 true, i32 24 } +; CHECK-NEXT: store i32 24, ptr [[VAL]], align 4 +; CHECK-NEXT: ret i1 true ; ; ; CHECK-LABEL: define {{[^@]+}}@i1_zeroext_one_out_arg_i32_1_use -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[I1_ZEROEXT_ONE_OUT_ARG_I32_1_USE:%.*]] @i1_zeroext_one_out_arg_i32_1_use.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[I1_ZEROEXT_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 1 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue 
[[I1_ZEROEXT_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 0 -; CHECK-NEXT: ret i1 [[TMP4]] -; -; -; CHECK-LABEL: define {{[^@]+}}@i1_signext_one_out_arg_i32_1_use.body ; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[I1_SIGNEXT_ONE_OUT_ARG_I32_1_USE:%.*]] { i1 true, i32 24 } +; CHECK-NEXT: store i32 24, ptr [[VAL]], align 4 +; CHECK-NEXT: ret i1 true ; ; ; CHECK-LABEL: define {{[^@]+}}@i1_signext_one_out_arg_i32_1_use -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[I1_SIGNEXT_ONE_OUT_ARG_I32_1_USE:%.*]] @i1_signext_one_out_arg_i32_1_use.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[I1_SIGNEXT_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 1 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[I1_SIGNEXT_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 0 -; CHECK-NEXT: ret i1 [[TMP4]] -; -; -; CHECK-LABEL: define {{[^@]+}}@p1i32_noalias_one_out_arg_i32_1_use.body ; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[P1I32_NOALIAS_ONE_OUT_ARG_I32_1_USE:%.*]] { ptr addrspace(1) null, i32 24 } +; CHECK-NEXT: store i32 24, ptr [[VAL]], align 4 +; CHECK-NEXT: ret i1 true ; ; ; CHECK-LABEL: define {{[^@]+}}@p1i32_noalias_one_out_arg_i32_1_use -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[P1I32_NOALIAS_ONE_OUT_ARG_I32_1_USE:%.*]] @p1i32_noalias_one_out_arg_i32_1_use.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[P1I32_NOALIAS_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 1 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[P1I32_NOALIAS_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 0 -; CHECK-NEXT: ret ptr addrspace(1) [[TMP4]] +; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store i32 24, ptr [[VAL]], align 4 +; CHECK-NEXT: ret ptr addrspace(1) null ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_non_private_arg_i32_1_use @@ -776,46 +650,23 @@ ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@func_ptr_type.body +; CHECK-LABEL: define {{[^@]+}}@func_ptr_type ; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[FUNC:%.*]] = load ptr, ptr poison, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[FUNC_PTR_TYPE:%.*]] poison, ptr [[FUNC]], 0 -; CHECK-NEXT: ret [[FUNC_PTR_TYPE]] [[TMP1]] -; -; -; CHECK-LABEL: define {{[^@]+}}@func_ptr_type -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[FUNC_PTR_TYPE:%.*]] @func_ptr_type.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[FUNC_PTR_TYPE]] [[TMP2]], 0 -; CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP0]], align 8 +; CHECK-NEXT: store ptr [[FUNC]], ptr [[OUT]], align 8 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@bitcast_func_ptr_type.body +; CHECK-LABEL: define {{[^@]+}}@bitcast_func_ptr_type ; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[FUNC:%.*]] = load ptr, ptr poison, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_FUNC_PTR_TYPE:%.*]] poison, ptr [[FUNC]], 0 -; CHECK-NEXT: ret [[BITCAST_FUNC_PTR_TYPE]] [[TMP1]] -; -; -; CHECK-LABEL: define {{[^@]+}}@bitcast_func_ptr_type -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_FUNC_PTR_TYPE:%.*]] @bitcast_func_ptr_type.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[BITCAST_FUNC_PTR_TYPE]] [[TMP2]], 0 -; CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP0]], align 8 +; CHECK-NEXT: store ptr [[FUNC]], ptr [[OUT]], align 8 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define 
{{[^@]+}}@out_arg_small_array.body -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[OUT_ARG_SMALL_ARRAY:%.*]] { [4 x i32] [i32 0, i32 1, i32 2, i32 3] } -; -; ; CHECK-LABEL: define {{[^@]+}}@out_arg_small_array -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[OUT_ARG_SMALL_ARRAY:%.*]] @out_arg_small_array.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[OUT_ARG_SMALL_ARRAY]] [[TMP2]], 0 -; CHECK-NEXT: store [4 x i32] [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store [4 x i32] [i32 0, i32 1, i32 2, i32 3], ptr [[VAL]], align 4 ; CHECK-NEXT: ret void ; ; @@ -832,325 +683,151 @@ ; CHECK-NEXT: ret <16 x i32> [[LOAD]] ; ; -; CHECK-LABEL: define {{[^@]+}}@num_regs_reach_limit.body +; CHECK-LABEL: define {{[^@]+}}@num_regs_reach_limit ; CHECK-SAME: (ptr [[OUT:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load volatile [15 x i32], ptr addrspace(1) poison, align 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[NUM_REGS_REACH_LIMIT:%.*]] poison, [15 x i32] [[LOAD]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[NUM_REGS_REACH_LIMIT]] [[TMP1]], i32 [[VAL]], 1 -; CHECK-NEXT: ret [[NUM_REGS_REACH_LIMIT]] [[TMP2]] -; -; -; CHECK-LABEL: define {{[^@]+}}@num_regs_reach_limit -; CHECK-SAME: (ptr [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[NUM_REGS_REACH_LIMIT:%.*]] @num_regs_reach_limit.body(ptr poison, i32 [[TMP1]]) -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[NUM_REGS_REACH_LIMIT]] [[TMP3]], 1 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[NUM_REGS_REACH_LIMIT]] [[TMP3]], 0 -; CHECK-NEXT: ret [15 x i32] [[TMP5]] +; CHECK-NEXT: store i32 [[VAL]], ptr [[OUT]], align 4 +; CHECK-NEXT: ret [15 x i32] [[LOAD]] ; ; -; CHECK-LABEL: define {{[^@]+}}@num_regs_reach_limit_leftover.body +; CHECK-LABEL: define {{[^@]+}}@num_regs_reach_limit_leftover ; CHECK-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], i32 [[VAL0:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD0:%.*]] = load volatile [15 x i32], ptr addrspace(1) poison, align 4 ; CHECK-NEXT: [[LOAD1:%.*]] = load volatile i32, ptr addrspace(1) poison, align 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[NUM_REGS_REACH_LIMIT_LEFTOVER:%.*]] poison, [15 x i32] [[LOAD0]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[NUM_REGS_REACH_LIMIT_LEFTOVER]] [[TMP1]], i32 [[LOAD1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[NUM_REGS_REACH_LIMIT_LEFTOVER]] [[TMP2]], i32 [[VAL0]], 2 -; CHECK-NEXT: ret [[NUM_REGS_REACH_LIMIT_LEFTOVER]] [[TMP3]] -; -; -; CHECK-LABEL: define {{[^@]+}}@num_regs_reach_limit_leftover -; CHECK-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP4:%.*]] = call [[NUM_REGS_REACH_LIMIT_LEFTOVER:%.*]] @num_regs_reach_limit_leftover.body(ptr poison, ptr poison, i32 [[TMP2]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[NUM_REGS_REACH_LIMIT_LEFTOVER]] [[TMP4]], 1 -; CHECK-NEXT: store i32 [[TMP5]], ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = extractvalue [[NUM_REGS_REACH_LIMIT_LEFTOVER]] [[TMP4]], 2 -; CHECK-NEXT: store i32 [[TMP6]], ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractvalue [[NUM_REGS_REACH_LIMIT_LEFTOVER]] [[TMP4]], 0 -; CHECK-NEXT: ret [15 x i32] [[TMP7]] -; -; -; CHECK-LABEL: define {{[^@]+}}@preserve_debug_info.body -; CHECK-SAME: (i32 [[ARG0:%.*]], ptr [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @may.clobber(), !dbg [[DBG5:![0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = 
insertvalue [[PRESERVE_DEBUG_INFO:%.*]] poison, i32 [[ARG0]], 0, !dbg [[DBG11:![0-9]+]] -; CHECK-NEXT: ret [[PRESERVE_DEBUG_INFO]] [[TMP1]], !dbg [[DBG11]] +; CHECK-NEXT: store i32 [[VAL0]], ptr [[OUT0]], align 4 +; CHECK-NEXT: store i32 [[LOAD1]], ptr [[OUT1]], align 4 +; CHECK-NEXT: ret [15 x i32] [[LOAD0]] ; ; ; CHECK-LABEL: define {{[^@]+}}@preserve_debug_info -; CHECK-SAME: (i32 [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR2]] !dbg [[DBG6:![0-9]+]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[PRESERVE_DEBUG_INFO:%.*]] @preserve_debug_info.body(i32 [[TMP0]], ptr poison) -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[PRESERVE_DEBUG_INFO]] [[TMP3]], 0 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP1]], align 4 -; CHECK-NEXT: ret void -; -; -; CHECK-LABEL: define {{[^@]+}}@preserve_metadata.body -; CHECK-SAME: (i32 [[ARG0:%.*]], ptr [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @may.clobber() -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[PRESERVE_METADATA:%.*]] poison, i32 [[ARG0]], 0 -; CHECK-NEXT: ret [[PRESERVE_METADATA]] [[TMP1]] +; CHECK-SAME: (i32 [[ARG0:%.*]], ptr [[VAL:%.*]]) #[[ATTR0]] !dbg [[DBG5:![0-9]+]] { +; CHECK-NEXT: call void @may.clobber(), !dbg [[DBG10:![0-9]+]] +; CHECK-NEXT: store i32 [[ARG0]], ptr [[VAL]], align 4, !dbg [[DBG11:![0-9]+]] +; CHECK-NEXT: ret void, !dbg [[DBG12:![0-9]+]] ; ; ; CHECK-LABEL: define {{[^@]+}}@preserve_metadata -; CHECK-SAME: (i32 [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR2]] !kernel_arg_access_qual !12 { -; CHECK-NEXT: [[TMP3:%.*]] = call [[PRESERVE_METADATA:%.*]] @preserve_metadata.body(i32 [[TMP0]], ptr poison) -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[PRESERVE_METADATA]] [[TMP3]], 0 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP1]], align 4 +; CHECK-SAME: (i32 [[ARG0:%.*]], ptr [[VAL:%.*]]) #[[ATTR0]] !kernel_arg_access_qual !13 { +; CHECK-NEXT: call void @may.clobber() +; CHECK-NEXT: store i32 [[ARG0]], ptr [[VAL]], align 4 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3i32.body +; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3i32 ; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load volatile <4 x i32>, ptr addrspace(1) poison, align 16 -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_V4I32_V3I32:%.*]] poison, <4 x i32> [[LOAD]], 0 -; CHECK-NEXT: ret [[BITCAST_POINTER_V4I32_V3I32]] [[TMP1]] -; -; -; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3i32 -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_V4I32_V3I32:%.*]] @bitcast_pointer_v4i32_v3i32.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_V4I32_V3I32]] [[TMP2]], 0 -; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP0]], align 16 +; CHECK-NEXT: store <4 x i32> [[LOAD]], ptr [[OUT]], align 16 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3f32.body +; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3f32 ; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load volatile <4 x i32>, ptr addrspace(1) poison, align 16 -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_V4I32_V3F32:%.*]] poison, <4 x i32> [[LOAD]], 0 -; CHECK-NEXT: ret [[BITCAST_POINTER_V4I32_V3F32]] [[TMP1]] -; -; -; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3f32 -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_V4I32_V3F32:%.*]] @bitcast_pointer_v4i32_v3f32.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue 
[[BITCAST_POINTER_V4I32_V3F32]] [[TMP2]], 0 -; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP0]], align 16 +; CHECK-NEXT: store <4 x i32> [[LOAD]], ptr [[OUT]], align 16 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f32.body +; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f32 ; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load volatile i32, ptr addrspace(1) poison, align 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_I32_F32:%.*]] poison, i32 [[LOAD]], 0 -; CHECK-NEXT: ret [[BITCAST_POINTER_I32_F32]] [[TMP1]] -; -; -; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f32 -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_I32_F32:%.*]] @bitcast_pointer_i32_f32.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_I32_F32]] [[TMP2]], 0 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f16.body +; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f16 ; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load volatile i32, ptr addrspace(1) poison, align 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_I32_F16:%.*]] poison, i32 [[LOAD]], 0 -; CHECK-NEXT: ret [[BITCAST_POINTER_I32_F16]] [[TMP1]] -; -; -; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f16 -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_I32_F16:%.*]] @bitcast_pointer_i32_f16.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_I32_F16]] [[TMP2]], 0 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_f16_i32.body +; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_f16_i32 ; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load volatile half, ptr addrspace(1) poison, align 2 -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_F16_I32:%.*]] poison, half [[LOAD]], 0 -; CHECK-NEXT: ret [[BITCAST_POINTER_F16_I32]] [[TMP1]] -; -; -; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_f16_i32 -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_F16_I32:%.*]] @bitcast_pointer_f16_i32.body(ptr poison) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_F16_I32]] [[TMP2]], 0 -; CHECK-NEXT: store half [[TMP3]], ptr [[TMP0]], align 2 +; CHECK-NEXT: store half [[LOAD]], ptr [[OUT]], align 2 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3f32.body +; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3f32 ; CHECK-SAME: (ptr [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V3F32:%.*]] poison, <4 x float> [[EXTRACTVEC]], 0 -; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_V3F32]] [[TMP1]] -; -; -; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3f32 -; CHECK-SAME: (ptr [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V3F32:%.*]] @bitcast_struct_v3f32_v3f32.body(ptr poison, <3 x float> [[TMP1]]) -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V3F32]] 
[[TMP3]], 0
-; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP0]], align 16
+; CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr [[OUT]], align 16
 ; CHECK-NEXT: ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3i32.body
+; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3i32
 ; CHECK-SAME: (ptr [[OUT:%.*]], <3 x i32> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[VALUE]], <3 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V3I32:%.*]] poison, <4 x i32> [[EXTRACTVEC]], 0
-; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_V3I32]] [[TMP1]]
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3i32
-; CHECK-SAME: (ptr [[TMP0:%.*]], <3 x i32> [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V3I32:%.*]] @bitcast_struct_v3f32_v3i32.body(ptr poison, <3 x i32> [[TMP1]])
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V3I32]] [[TMP3]], 0
-; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP0]], align 16
+; CHECK-NEXT: store <4 x i32> [[EXTRACTVEC]], ptr [[OUT]], align 16
 ; CHECK-NEXT: ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v4f32.body
-; CHECK-SAME: (ptr [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V4F32_V4F32:%.*]] poison, <4 x float> [[VALUE]], 0
-; CHECK-NEXT: ret [[BITCAST_STRUCT_V4F32_V4F32]] [[TMP1]]
-;
-;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v4f32
-; CHECK-SAME: (ptr [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V4F32_V4F32:%.*]] @bitcast_struct_v4f32_v4f32.body(ptr poison, <4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V4F32_V4F32]] [[TMP3]], 0
-; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP0]], align 16
+; CHECK-SAME: (ptr [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: store <4 x float> [[VALUE]], ptr [[OUT]], align 16
 ; CHECK-NEXT: ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v4i32.body
-; CHECK-SAME: (ptr [[OUT:%.*]], <4 x i32> [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V4I32:%.*]] poison, <4 x i32> [[VALUE]], 0
-; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_V4I32]] [[TMP1]]
-;
-;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v4i32
-; CHECK-SAME: (ptr [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V4I32:%.*]] @bitcast_struct_v3f32_v4i32.body(ptr poison, <4 x i32> [[TMP1]])
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V4I32]] [[TMP3]], 0
-; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP0]], align 16
+; CHECK-SAME: (ptr [[OUT:%.*]], <4 x i32> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: store <4 x i32> [[VALUE]], ptr [[OUT]], align 16
 ; CHECK-NEXT: ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v3f32.body
+; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v3f32
 ; CHECK-SAME: (ptr [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32>
-; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V4F32_V3F32:%.*]] poison, <4 x float> [[EXTRACTVEC]], 0
-; CHECK-NEXT: ret [[BITCAST_STRUCT_V4F32_V3F32]] [[TMP1]]
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v3f32
-; CHECK-SAME: (ptr [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V4F32_V3F32:%.*]] @bitcast_struct_v4f32_v3f32.body(ptr poison, <3 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V4F32_V3F32]] [[TMP3]], 0
-; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP0]], align 16
+; CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr [[OUT]], align 16
 ; CHECK-NEXT: ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v2f32.body
-; CHECK-SAME: (ptr [[OUT:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V2F32:%.*]] poison, <2 x float> [[VALUE]], 0
-; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_V2F32]] [[TMP1]]
-;
-;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v2f32
-; CHECK-SAME: (ptr [[TMP0:%.*]], <2 x float> [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V2F32:%.*]] @bitcast_struct_v3f32_v2f32.body(ptr poison, <2 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V2F32]] [[TMP3]], 0
-; CHECK-NEXT: store <2 x float> [[TMP4]], ptr [[TMP0]], align 8
+; CHECK-SAME: (ptr [[OUT:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: store <2 x float> [[VALUE]], ptr [[OUT]], align 8
 ; CHECK-NEXT: ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v3f32.body
+; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v3f32
 ; CHECK-SAME: (ptr [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32>
-; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_F32_V3F32:%.*]] poison, <4 x float> [[EXTRACTVEC]], 0
-; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_F32_V3F32]] [[TMP1]]
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v3f32
-; CHECK-SAME: (ptr [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_F32_V3F32:%.*]] @bitcast_struct_v3f32_f32_v3f32.body(ptr poison, <3 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_F32_V3F32]] [[TMP3]], 0
-; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP0]], align 16
+; CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr [[OUT]], align 16
 ; CHECK-NEXT: ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v4f32.body
-; CHECK-SAME: (ptr [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_F32_V4F32:%.*]] poison, <4 x float> [[VALUE]], 0
-; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_F32_V4F32]] [[TMP1]]
-;
-;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v4f32
-; CHECK-SAME: (ptr [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_F32_V4F32:%.*]] @bitcast_struct_v3f32_f32_v4f32.body(ptr poison, <4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_F32_V4F32]] [[TMP3]], 0
-; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP0]], align 16
-; CHECK-NEXT: ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_i128_v4f32.body
 ; CHECK-SAME: (ptr [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_I128_V4F32:%.*]] poison, <4 x float> [[VALUE]], 0
-; CHECK-NEXT: ret [[BITCAST_STRUCT_I128_V4F32]] [[TMP1]]
+; CHECK-NEXT: store <4 x float> [[VALUE]], ptr [[OUT]], align 16
+; CHECK-NEXT: ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_i128_v4f32
-; CHECK-SAME: (ptr [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_I128_V4F32:%.*]] @bitcast_struct_i128_v4f32.body(ptr poison, <4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_I128_V4F32]] [[TMP3]], 0
-; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP0]], align 16
+; CHECK-SAME: (ptr [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: store <4 x float> [[VALUE]], ptr [[OUT]], align 16
 ; CHECK-NEXT: ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_array_v4i32_v4f32.body
-; CHECK-SAME: (ptr [[OUT:%.*]], [4 x float] [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_ARRAY_V4I32_V4F32:%.*]] poison, [4 x float] [[VALUE]], 0
-; CHECK-NEXT: ret [[BITCAST_ARRAY_V4I32_V4F32]] [[TMP1]]
-;
-;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_array_v4i32_v4f32
-; CHECK-SAME: (ptr [[TMP0:%.*]], [4 x float] [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_ARRAY_V4I32_V4F32:%.*]] @bitcast_array_v4i32_v4f32.body(ptr poison, [4 x float] [[TMP1]])
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_ARRAY_V4I32_V4F32]] [[TMP3]], 0
-; CHECK-NEXT: store [4 x float] [[TMP4]], ptr [[TMP0]], align 4
+; CHECK-SAME: (ptr [[OUT:%.*]], [4 x float] [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: store [4 x float] [[VALUE]], ptr [[OUT]], align 4
 ; CHECK-NEXT: ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@multi_return_bitcast_struct_v3f32_v3f32.body
+; CHECK-LABEL: define {{[^@]+}}@multi_return_bitcast_struct_v3f32_v3f32
 ; CHECK-SAME: (i1 [[COND:%.*]], ptr [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 [[COND]], label [[RET0:%.*]], label [[RET1:%.*]]
 ; CHECK: ret0:
 ; CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32>
-; CHECK-NEXT: [[TMP0:%.*]] = insertvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32:%.*]] poison, <4 x float> [[EXTRACTVEC]], 0
-; CHECK-NEXT: ret [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP0]]
+; CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr [[OUT]], align 16
+; CHECK-NEXT: ret void
 ; CHECK: ret1:
 ; CHECK-NEXT: [[LOAD:%.*]] = load <4 x float>, ptr addrspace(1) poison, align 16
-; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] poison, <4 x float> [[LOAD]], 0
-; CHECK-NEXT: ret [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP1]]
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@multi_return_bitcast_struct_v3f32_v3f32
-; CHECK-SAME: (i1 [[TMP0:%.*]], ptr [[TMP1:%.*]], <3 x float> [[TMP2:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT: [[TMP4:%.*]] = call [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32:%.*]] @multi_return_bitcast_struct_v3f32_v3f32.body(i1 [[TMP0]], ptr poison, <3 x float> [[TMP2]])
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP4]], 0
-; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP1]], align 16
+; CHECK-NEXT: store <4 x float> [[LOAD]], ptr [[OUT]], align 16
 ; CHECK-NEXT: ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_v3f32_struct_v3f32.body
-; CHECK-SAME: (ptr [[OUT:%.*]], [[STRUCT_V3F32:%.*]] [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_V3F32_STRUCT_V3F32:%.*]] poison, [[STRUCT_V3F32]] [[VALUE]], 0
-; CHECK-NEXT: ret [[BITCAST_V3F32_STRUCT_V3F32]] [[TMP1]]
-;
-;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_v3f32_struct_v3f32
-; CHECK-SAME: (ptr [[TMP0:%.*]], [[STRUCT_V3F32:%.*]] [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_V3F32_STRUCT_V3F32:%.*]] @bitcast_v3f32_struct_v3f32.body(ptr poison, [[STRUCT_V3F32]] [[TMP1]])
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_V3F32_STRUCT_V3F32]] [[TMP3]], 0
-; CHECK-NEXT: store [[STRUCT_V3F32]] [[TMP4]], ptr [[TMP0]], align 16
+; CHECK-SAME: (ptr [[OUT:%.*]], [[STRUCT_V3F32:%.*]] [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: store [[STRUCT_V3F32]] [[VALUE]], ptr [[OUT]], align 4
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
@@ -36,11 +36,11 @@
 ; CHECK-ALU64-NEXT: # %bb.0: # %entry
 ; CHECK-ALU64-NEXT: #DEBUG_VALUE: test:arg <- $r1
 ; CHECK-ALU64-NEXT: .Ltmp0:
-; CHECK-ALU64-NEXT: r1 = 20
+; CHECK-ALU64-NEXT: r1 = 16
 ; CHECK-ALU64-NEXT: .Ltmp1:
 ; CHECK-ALU64-NEXT: .Ltmp2:
 ; CHECK-ALU64-NEXT: .Ltmp3:
-; CHECK-ALU64-NEXT: r0 = 4
+; CHECK-ALU64-NEXT: r0 = 8
 ; CHECK-ALU64-NEXT: .Ltmp4:
 ; CHECK-ALU64-NEXT: .loc 1 12 69 prologue_end # test.c:12:69
 ; CHECK-ALU64-NEXT: .Ltmp5:
@@ -67,11 +67,11 @@
 ; CHECK-ALU32-NEXT: # %bb.0: # %entry
 ; CHECK-ALU32-NEXT: #DEBUG_VALUE: test:arg <- $r1
 ; CHECK-ALU32-NEXT: .Ltmp0:
-; CHECK-ALU32-NEXT: r1 = 20
+; CHECK-ALU32-NEXT: r1 = 16
 ; CHECK-ALU32-NEXT: .Ltmp1:
 ; CHECK-ALU32-NEXT: .Ltmp2:
 ; CHECK-ALU32-NEXT: .Ltmp3:
-; CHECK-ALU32-NEXT: r0 = 4
+; CHECK-ALU32-NEXT: r0 = 8
 ; CHECK-ALU32-NEXT: .Ltmp4:
 ; CHECK-ALU32-NEXT: .loc 1 12 69 prologue_end # test.c:12:69
 ; CHECK-ALU32-NEXT: .Ltmp5:
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
@@ -36,18 +36,18 @@
 ; CHECK-ALU64-NEXT: # %bb.0: # %entry
 ; CHECK-ALU64-NEXT: #DEBUG_VALUE: test:arg <- $r1
 ; CHECK-ALU64-NEXT: .Ltmp0:
-; CHECK-ALU64-NEXT: r1 = 20
+; CHECK-ALU64-NEXT: r1 = 16
 ; CHECK-ALU64-NEXT: .Ltmp1:
 ; CHECK-ALU64-NEXT: .Ltmp2:
 ; CHECK-ALU64-NEXT: .Ltmp3:
-; CHECK-ALU64-NEXT: r0 = 4
+; CHECK-ALU64-NEXT: r0 = 8
 ; CHECK-ALU64-NEXT: .Ltmp4:
 ; CHECK-ALU64-NEXT: .loc 1 12 69 prologue_end # test.c:12:69
 ; CHECK-ALU64-NEXT: .Ltmp5:
 ; CHECK-ALU64-NEXT: .Ltmp6:
 ; CHECK-ALU64-NEXT: r0 += r1
 ; CHECK-ALU64-NEXT: .Ltmp7:
-; CHECK-ALU64-NEXT: r1 = 50
+; CHECK-ALU64-NEXT: r1 = 18
 ; CHECK-ALU64-NEXT: .loc 1 13 67 # test.c:13:67
 ; CHECK-ALU64-NEXT: .Ltmp8:
 ; CHECK-ALU64-NEXT: r0 += r1
@@ -67,18 +67,18 @@
 ; CHECK-ALU32-NEXT: # %bb.0: # %entry
 ; CHECK-ALU32-NEXT: #DEBUG_VALUE: test:arg <- $r1
 ; CHECK-ALU32-NEXT: .Ltmp0:
-; CHECK-ALU32-NEXT: r1 = 20
+; CHECK-ALU32-NEXT: r1 = 16
 ; CHECK-ALU32-NEXT: .Ltmp1:
 ; CHECK-ALU32-NEXT: .Ltmp2:
 ; CHECK-ALU32-NEXT: .Ltmp3:
-; CHECK-ALU32-NEXT: r0 = 4
+; CHECK-ALU32-NEXT: r0 = 8
 ; CHECK-ALU32-NEXT: .Ltmp4:
 ; CHECK-ALU32-NEXT: .loc 1 12 69 prologue_end # test.c:12:69
 ; CHECK-ALU32-NEXT: .Ltmp5:
 ; CHECK-ALU32-NEXT: .Ltmp6:
 ; CHECK-ALU32-NEXT: w0 += w1
 ; CHECK-ALU32-NEXT: .Ltmp7:
-; CHECK-ALU32-NEXT: r1 = 50
+; CHECK-ALU32-NEXT: r1 = 18
 ; CHECK-ALU32-NEXT: .loc 1 13 67 # test.c:13:67
 ; CHECK-ALU32-NEXT: .Ltmp8:
 ; CHECK-ALU32-NEXT: w0 += w1
diff --git a/llvm/test/CodeGen/X86/expand-large-div-rem-sdiv129.ll b/llvm/test/CodeGen/X86/expand-large-div-rem-sdiv129.ll
--- a/llvm/test/CodeGen/X86/expand-large-div-rem-sdiv129.ll
+++ b/llvm/test/CodeGen/X86/expand-large-div-rem-sdiv129.ll
@@ -4,7 +4,7 @@
 define void @sdiv129(ptr %ptr, ptr %out) nounwind {
 ; CHECK-LABEL: @sdiv129(
 ; CHECK-NEXT: _udiv-special-cases:
-; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 8
 ; CHECK-NEXT: [[TMP0:%.*]] = freeze i129 [[A]]
 ; CHECK-NEXT: [[TMP1:%.*]] = freeze i129 3
 ; CHECK-NEXT: [[TMP2:%.*]] = ashr i129 [[TMP0]], 128
@@ -66,7 +66,7 @@
 ; CHECK-NEXT: [[TMP48:%.*]] = phi i129 [ [[TMP25]], [[UDIV_LOOP_EXIT]] ], [ [[TMP20]], [[_UDIV_SPECIAL_CASES:%.*]] ]
 ; CHECK-NEXT: [[TMP49:%.*]] = xor i129 [[TMP48]], [[TMP8]]
 ; CHECK-NEXT: [[TMP50:%.*]] = sub i129 [[TMP49]], [[TMP8]]
-; CHECK-NEXT: store i129 [[TMP50]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT: store i129 [[TMP50]], ptr [[OUT:%.*]], align 8
 ; CHECK-NEXT: ret void
 ;
 %a = load i129, ptr %ptr
diff --git a/llvm/test/CodeGen/X86/expand-large-div-rem-srem129.ll b/llvm/test/CodeGen/X86/expand-large-div-rem-srem129.ll
--- a/llvm/test/CodeGen/X86/expand-large-div-rem-srem129.ll
+++ b/llvm/test/CodeGen/X86/expand-large-div-rem-srem129.ll
@@ -4,7 +4,7 @@
 define void @test(ptr %ptr, ptr %out) nounwind {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: _udiv-special-cases:
-; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 8
 ; CHECK-NEXT: [[TMP0:%.*]] = freeze i129 [[A]]
 ; CHECK-NEXT: [[TMP1:%.*]] = freeze i129 3
 ; CHECK-NEXT: [[TMP2:%.*]] = ashr i129 [[TMP0]], 128
@@ -69,7 +69,7 @@
 ; CHECK-NEXT: [[TMP51:%.*]] = sub i129 [[TMP8]], [[TMP50]]
 ; CHECK-NEXT: [[TMP52:%.*]] = xor i129 [[TMP51]], [[TMP2]]
 ; CHECK-NEXT: [[TMP53:%.*]] = sub i129 [[TMP52]], [[TMP2]]
-; CHECK-NEXT: store i129 [[TMP53]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT: store i129 [[TMP53]], ptr [[OUT:%.*]], align 8
 ; CHECK-NEXT: ret void
 ;
 %a = load i129, ptr %ptr
diff --git a/llvm/test/CodeGen/X86/expand-large-div-rem-udiv129.ll b/llvm/test/CodeGen/X86/expand-large-div-rem-udiv129.ll
--- a/llvm/test/CodeGen/X86/expand-large-div-rem-udiv129.ll
+++ b/llvm/test/CodeGen/X86/expand-large-div-rem-udiv129.ll
@@ -4,7 +4,7 @@
 define void @test(ptr %ptr, ptr %out) nounwind {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: _udiv-special-cases:
-; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 8
 ; CHECK-NEXT: [[TMP0:%.*]] = freeze i129 3
 ; CHECK-NEXT: [[TMP1:%.*]] = freeze i129 [[A]]
 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i129 [[TMP0]], 0
@@ -55,7 +55,7 @@
 ; CHECK-NEXT: br i1 [[TMP38]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]]
 ; CHECK: udiv-end:
 ; CHECK-NEXT: [[TMP39:%.*]] = phi i129 [ [[TMP16]], [[UDIV_LOOP_EXIT]] ], [ [[TMP11]], [[_UDIV_SPECIAL_CASES:%.*]] ]
-; CHECK-NEXT: store i129 [[TMP39]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT: store i129 [[TMP39]], ptr [[OUT:%.*]], align 8
 ; CHECK-NEXT: ret void
 ;
 %a = load i129, ptr %ptr
diff --git a/llvm/test/CodeGen/X86/expand-large-div-rem-urem129.ll b/llvm/test/CodeGen/X86/expand-large-div-rem-urem129.ll
--- a/llvm/test/CodeGen/X86/expand-large-div-rem-urem129.ll
+++ b/llvm/test/CodeGen/X86/expand-large-div-rem-urem129.ll
@@ -4,7 +4,7 @@
 define void @test(ptr %ptr, ptr %out) nounwind {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: _udiv-special-cases:
-; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 8
 ; CHECK-NEXT: [[TMP0:%.*]] = freeze i129 [[A]]
 ; CHECK-NEXT: [[TMP1:%.*]] = freeze i129 3
 ; CHECK-NEXT: [[TMP2:%.*]] = freeze i129 [[TMP1]]
@@ -59,7 +59,7 @@
 ; CHECK-NEXT: [[TMP41:%.*]] = phi i129 [ [[TMP18]], [[UDIV_LOOP_EXIT]] ], [ [[TMP13]], [[_UDIV_SPECIAL_CASES:%.*]] ]
 ; CHECK-NEXT: [[TMP42:%.*]] = mul i129 [[TMP1]], [[TMP41]]
 ; CHECK-NEXT: [[TMP43:%.*]] = sub i129 [[TMP0]], [[TMP42]]
-; CHECK-NEXT: store i129 [[TMP43]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT: store i129 [[TMP43]], ptr [[OUT:%.*]], align 8
 ; CHECK-NEXT: ret void
 ;
 %a = load i129, ptr %ptr
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll b/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll
--- a/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll
@@ -80,7 +80,7 @@
 ; AARCH64-SCOPE-LABEL: @standard_lifetime(
 ; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1:![0-9]+]])
 ; AARCH64-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -88,13 +88,13 @@
 ; AARCH64-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -132,7 +132,7 @@
 ; AARCH64-NOSCOPE-LABEL: @standard_lifetime(
 ; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1:![0-9]+]])
 ; AARCH64-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -140,13 +140,13 @@
 ; AARCH64-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -182,7 +182,7 @@
 ; AARCH64-SHORT-SCOPE-LABEL: @standard_lifetime(
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1:![0-9]+]])
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -190,13 +190,13 @@
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -237,7 +237,7 @@
 ; AARCH64-SHORT-NOSCOPE-LABEL: @standard_lifetime(
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1:![0-9]+]])
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -245,13 +245,13 @@
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -361,7 +361,7 @@
 ; AARCH64-SCOPE-LABEL: @standard_lifetime_optnone(
 ; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -369,13 +369,13 @@
 ; AARCH64-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -413,7 +413,7 @@
 ; AARCH64-NOSCOPE-LABEL: @standard_lifetime_optnone(
 ; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -421,13 +421,13 @@
 ; AARCH64-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -463,7 +463,7 @@
 ; AARCH64-SHORT-SCOPE-LABEL: @standard_lifetime_optnone(
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -471,13 +471,13 @@
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -518,7 +518,7 @@
 ; AARCH64-SHORT-NOSCOPE-LABEL: @standard_lifetime_optnone(
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -526,13 +526,13 @@
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -632,7 +632,7 @@
 ; AARCH64-SCOPE-LABEL: @multiple_lifetimes(
 ; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -640,13 +640,13 @@
 ; AARCH64-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -678,7 +678,7 @@
 ; AARCH64-NOSCOPE-LABEL: @multiple_lifetimes(
 ; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -686,13 +686,13 @@
 ; AARCH64-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -724,7 +724,7 @@
 ; AARCH64-SHORT-SCOPE-LABEL: @multiple_lifetimes(
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -732,13 +732,13 @@
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -773,7 +773,7 @@
 ; AARCH64-SHORT-NOSCOPE-LABEL: @multiple_lifetimes(
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -781,13 +781,13 @@
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -892,7 +892,7 @@
 ; AARCH64-SCOPE-LABEL: @unreachable_exit(
 ; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -900,13 +900,13 @@
 ; AARCH64-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -949,7 +949,7 @@
 ; AARCH64-NOSCOPE-LABEL: @unreachable_exit(
 ; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -957,13 +957,13 @@
 ; AARCH64-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -1005,7 +1005,7 @@
 ; AARCH64-SHORT-SCOPE-LABEL: @unreachable_exit(
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -1013,13 +1013,13 @@
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -1065,7 +1065,7 @@
 ; AARCH64-SHORT-NOSCOPE-LABEL: @unreachable_exit(
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -1073,13 +1073,13 @@
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -1200,7 +1200,7 @@
 ; AARCH64-SCOPE-LABEL: @diamond_lifetime(
 ; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -1208,13 +1208,13 @@
 ; AARCH64-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -1261,7 +1261,7 @@
 ; AARCH64-NOSCOPE-LABEL: @diamond_lifetime(
 ; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -1269,13 +1269,13 @@
 ; AARCH64-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -1313,7 +1313,7 @@
 ; AARCH64-SHORT-SCOPE-LABEL: @diamond_lifetime(
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -1321,13 +1321,13 @@
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -1377,7 +1377,7 @@
 ; AARCH64-SHORT-NOSCOPE-LABEL: @diamond_lifetime(
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -1385,13 +1385,13 @@
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
diff --git a/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll b/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll
--- a/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll
+++ b/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll
@@ -6,14 +6,14 @@
 define i32 @foo() #0 {
 ; CHECK-LABEL: define i32 @foo() comdat {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @__sanitizer_cov_trace_pc_guard(ptr @__sancov_gen_) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT: call void @__sanitizer_cov_trace_pc_guard(ptr inttoptr (i32 ptrtoint (ptr @__sancov_gen_ to i32) to ptr)) #[[ATTR1:[0-9]+]]
 ; CHECK-NEXT: ret i32 0
 ;
 entry:
 ret i32 0
 }
-; CHECK-DAG: declare void @__sanitizer_cov_trace_pc_indir(i64)
+; CHECK-DAG: declare void @__sanitizer_cov_trace_pc_indir(i32)
 ; CHECK-DAG: declare void @__sanitizer_cov_trace_cmp1(i8 zeroext, i8 zeroext)
 ; CHECK-DAG: declare void @__sanitizer_cov_trace_cmp2(i16 zeroext, i16 zeroext)
 ; CHECK-DAG: declare void @__sanitizer_cov_trace_cmp4(i32 zeroext, i32 zeroext)
@@ -24,7 +24,7 @@
 ; CHECK-DAG: declare void @__sanitizer_cov_trace_const_cmp8(i64, i64)
 ; CHECK-DAG: declare void @__sanitizer_cov_trace_div4(i32 zeroext)
 ; CHECK-DAG: declare void @__sanitizer_cov_trace_div8(i64)
-; CHECK-DAG: declare void @__sanitizer_cov_trace_gep(i64)
+; CHECK-DAG: declare void @__sanitizer_cov_trace_gep(i32)
 ; CHECK-DAG: declare void @__sanitizer_cov_trace_switch(i64, ptr)
 ; CHECK-DAG: declare void @__sanitizer_cov_trace_pc()
 ; CHECK-DAG: declare void @__sanitizer_cov_trace_pc_guard(ptr)
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -atomic-expand %s | FileCheck %s
-; RUN: opt -mtriple=r600-mesa-mesa3d -S -atomic-expand %s | FileCheck %s
+; RUN: opt -mtriple=r600-mesa-mesa3d -S -atomic-expand %s | FileCheck %s --check-prefix=R600
 define i8 @test_atomicrmw_xchg_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_i8_global_agent(
@@ -27,6 +27,30 @@
 ; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
 ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
 ; CHECK-NEXT: ret i8 [[EXTRACTED]]
+;
+; R600-LABEL: @test_atomicrmw_xchg_i8_global_agent(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; R600-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[TMP5:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[TMP6:%.*]] = or i32 [[TMP5]], [[VALOPERAND_SHIFTED]]
+; R600-NEXT: [[TMP7:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP6]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED]]
 ;
 %res = atomicrmw xchg ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
 ret i8 %res
@@ -59,6 +83,32 @@
 ; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
 ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
 ; CHECK-NEXT: ret i8 [[EXTRACTED]]
+;
+; R600-LABEL: @test_atomicrmw_add_i8_global_agent(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; R600-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
+; R600-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]]
+; R600-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]]
+; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED]]
 ;
 %res = atomicrmw add ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
 ret i8 %res
@@ -91,6 +141,32 @@
 ; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
 ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
 ; CHECK-NEXT: ret i8 [[EXTRACTED]]
+;
+; R600-LABEL: @test_atomicrmw_add_i8_global_agent_align2(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; R600-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
+; R600-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]]
+; R600-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]]
+; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED]]
 ;
 %res = atomicrmw add ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 2
 ret i8 %res
@@ -114,6 +190,24 @@
 ; CHECK: atomicrmw.end:
 ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i8
 ; CHECK-NEXT: ret i8 [[EXTRACTED]]
+;
+; R600-LABEL: @test_atomicrmw_add_i8_global_agent_align4(
+; R600-NEXT: [[TMP1:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; R600-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[NEW:%.*]] = add i32 [[LOADED]], [[TMP1]]
+; R600-NEXT: [[TMP3:%.*]] = and i32 [[NEW]], 255
+; R600-NEXT: [[TMP4:%.*]] = and i32 [[LOADED]], -256
+; R600-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], [[TMP3]]
+; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED]]
 ;
 %res = atomicrmw add ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 4
 ret i8 %res
@@ -146,6 +240,32 @@
 ; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
 ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
 ; CHECK-NEXT: ret i8 [[EXTRACTED]]
+;
+; R600-LABEL: @test_atomicrmw_sub_i8_global_agent(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; R600-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
+; R600-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]]
+; R600-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]]
+; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED]]
 ;
 %res = atomicrmw sub ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
 ret i8 %res
@@ -167,6 +287,21 @@
 ; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
 ; CHECK-NEXT: ret i8 [[EXTRACTED]]
+;
+; R600-LABEL: @test_atomicrmw_and_i8_global_agent(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; R600-NEXT: [[ANDOPERAND:%.*]] = or i32 [[INV_MASK]], [[VALOPERAND_SHIFTED]]
+; R600-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED]]
 ;
 %res = atomicrmw and ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
 ret i8 %res
@@ -200,6 +335,33 @@
 ; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
 ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
 ; CHECK-NEXT: ret i8 [[EXTRACTED]]
+;
+; R600-LABEL: @test_atomicrmw_nand_i8_global_agent(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; R600-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[TMP5:%.*]] = and i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
+; R600-NEXT: [[NEW:%.*]] = xor i32 [[TMP5]], -1
+; R600-NEXT: [[TMP6:%.*]] = and i32 [[NEW]], [[MASK]]
+; R600-NEXT: [[TMP7:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[TMP8:%.*]] = or i32 [[TMP7]], [[TMP6]]
+; R600-NEXT: [[TMP9:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP8]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP9]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP9]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED]]
 ;
 %res = atomicrmw nand ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
 ret i8 %res
@@ -220,6 +382,20 @@
 ; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
 ; CHECK-NEXT: ret i8 [[EXTRACTED]]
+;
+; R600-LABEL: @test_atomicrmw_or_i8_global_agent(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; R600-NEXT: [[TMP4:%.*]] = atomicrmw or ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED]]
 ;
 %res = atomicrmw or ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
 ret i8 %res
@@ -240,6 +416,20 @@
 ; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
 ; CHECK-NEXT: ret i8 [[EXTRACTED]]
+;
+; R600-LABEL: @test_atomicrmw_xor_i8_global_agent(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; R600-NEXT: [[TMP4:%.*]] = atomicrmw xor ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED]]
 ;
 %res = atomicrmw xor ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
 ret i8 %res
@@ -274,6 +464,34 @@
 ; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
 ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
 ; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_max_i8_global_agent(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[TMP4:%.*]] = icmp sgt i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]]
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
 ;
 %res = atomicrmw max ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
 ret i8 %res
@@ -308,6 +526,34 @@
 ; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
 ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
 ; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_min_i8_global_agent(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] =
shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = icmp sle i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw min ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res @@ -342,6 +588,34 @@ ; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_umax_i8_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: 
ret i8 [[EXTRACTED3]] ; %res = atomicrmw umax ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res @@ -376,6 +650,34 @@ ; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_umin_i8_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = icmp ule i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw umin ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res @@ -417,6 +719,41 @@ ; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { i8, i1 } [[TMP17]], i1 [[TMP14]], 1 ; CHECK-NEXT: [[EXTRACT:%.*]] = extractvalue { i8, i1 } [[TMP18]], 0 ; CHECK-NEXT: ret i8 [[EXTRACT]] +; +; R600-LABEL: @test_cmpxchg_i8_global_agent( +; R600-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT:%.*]], i64 4 +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[GEP]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[GEP]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i8 [[IN:%.*]] to i32 +; R600-NEXT: [[TMP4:%.*]] = shl i32 [[TMP3]], [[TMP2]] +; R600-NEXT: [[TMP5:%.*]] = zext i8 [[OLD:%.*]] to i32 +; R600-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], [[TMP2]] +; R600-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: [[TMP8:%.*]] = and i32 [[TMP7]], [[INV_MASK]] +; R600-NEXT: br label [[PARTWORD_CMPXCHG_LOOP:%.*]] +; R600: partword.cmpxchg.loop: +; R600-NEXT: [[TMP9:%.*]] = phi i32 [ [[TMP8]], [[TMP0:%.*]] ], [ [[TMP15:%.*]], [[PARTWORD_CMPXCHG_FAILURE:%.*]] ] +; R600-NEXT: [[TMP10:%.*]] = or i32 [[TMP9]], 
[[TMP4]] +; R600-NEXT: [[TMP11:%.*]] = or i32 [[TMP9]], [[TMP6]] +; R600-NEXT: [[TMP12:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[TMP11]], i32 [[TMP10]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[TMP13:%.*]] = extractvalue { i32, i1 } [[TMP12]], 0 +; R600-NEXT: [[TMP14:%.*]] = extractvalue { i32, i1 } [[TMP12]], 1 +; R600-NEXT: br i1 [[TMP14]], label [[PARTWORD_CMPXCHG_END:%.*]], label [[PARTWORD_CMPXCHG_FAILURE]] +; R600: partword.cmpxchg.failure: +; R600-NEXT: [[TMP15]] = and i32 [[TMP13]], [[INV_MASK]] +; R600-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP9]], [[TMP15]] +; R600-NEXT: br i1 [[TMP16]], label [[PARTWORD_CMPXCHG_LOOP]], label [[PARTWORD_CMPXCHG_END]] +; R600: partword.cmpxchg.end: +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP13]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP17:%.*]] = insertvalue { i8, i1 } poison, i8 [[EXTRACTED]], 0 +; R600-NEXT: [[TMP18:%.*]] = insertvalue { i8, i1 } [[TMP17]], i1 [[TMP14]], 1 +; R600-NEXT: [[EXTRACT:%.*]] = extractvalue { i8, i1 } [[TMP18]], 0 +; R600-NEXT: ret i8 [[EXTRACT]] ; %gep = getelementptr i8, ptr addrspace(1) %out, i64 4 %res = cmpxchg ptr addrspace(1) %gep, i8 %old, i8 %in syncscope("agent") seq_cst seq_cst @@ -427,17 +764,16 @@ define i8 @test_cmpxchg_i8_local_align2(ptr addrspace(3) %out, i8 %in, i8 %old) { ; CHECK-LABEL: @test_cmpxchg_i8_local_align2( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT:%.*]], i64 4 -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[GEP]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[GEP]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[GEP]], i32 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[GEP]] to i32 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[IN:%.*]] to i32 -; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = zext i8 [[OLD:%.*]] to i32 -; CHECK-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = and i32 [[TMP7]], [[INV_MASK]] ; CHECK-NEXT: br label [[PARTWORD_CMPXCHG_LOOP:%.*]] @@ -454,12 +790,47 @@ ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP9]], [[TMP15]] ; CHECK-NEXT: br i1 [[TMP16]], label [[PARTWORD_CMPXCHG_LOOP]], label [[PARTWORD_CMPXCHG_END]] ; CHECK: partword.cmpxchg.end: -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP13]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP13]], [[TMP2]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { i8, i1 } poison, i8 [[EXTRACTED]], 0 ; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { i8, i1 } [[TMP17]], i1 [[TMP14]], 1 ; CHECK-NEXT: [[EXTRACT:%.*]] = extractvalue { i8, i1 } [[TMP18]], 0 ; CHECK-NEXT: ret i8 [[EXTRACT]] +; +; R600-LABEL: @test_cmpxchg_i8_local_align2( +; 
R600-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT:%.*]], i64 4 +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[GEP]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[GEP]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i8 [[IN:%.*]] to i32 +; R600-NEXT: [[TMP4:%.*]] = shl i32 [[TMP3]], [[TMP2]] +; R600-NEXT: [[TMP5:%.*]] = zext i8 [[OLD:%.*]] to i32 +; R600-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], [[TMP2]] +; R600-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 +; R600-NEXT: [[TMP8:%.*]] = and i32 [[TMP7]], [[INV_MASK]] +; R600-NEXT: br label [[PARTWORD_CMPXCHG_LOOP:%.*]] +; R600: partword.cmpxchg.loop: +; R600-NEXT: [[TMP9:%.*]] = phi i32 [ [[TMP8]], [[TMP0:%.*]] ], [ [[TMP15:%.*]], [[PARTWORD_CMPXCHG_FAILURE:%.*]] ] +; R600-NEXT: [[TMP10:%.*]] = or i32 [[TMP9]], [[TMP4]] +; R600-NEXT: [[TMP11:%.*]] = or i32 [[TMP9]], [[TMP6]] +; R600-NEXT: [[TMP12:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[TMP11]], i32 [[TMP10]] seq_cst seq_cst, align 4 +; R600-NEXT: [[TMP13:%.*]] = extractvalue { i32, i1 } [[TMP12]], 0 +; R600-NEXT: [[TMP14:%.*]] = extractvalue { i32, i1 } [[TMP12]], 1 +; R600-NEXT: br i1 [[TMP14]], label [[PARTWORD_CMPXCHG_END:%.*]], label [[PARTWORD_CMPXCHG_FAILURE]] +; R600: partword.cmpxchg.failure: +; R600-NEXT: [[TMP15]] = and i32 [[TMP13]], [[INV_MASK]] +; R600-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP9]], [[TMP15]] +; R600-NEXT: br i1 [[TMP16]], label [[PARTWORD_CMPXCHG_LOOP]], label [[PARTWORD_CMPXCHG_END]] +; R600: partword.cmpxchg.end: +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP13]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP17:%.*]] = insertvalue { i8, i1 } poison, i8 [[EXTRACTED]], 0 +; R600-NEXT: [[TMP18:%.*]] = insertvalue { i8, i1 } [[TMP17]], i1 [[TMP14]], 1 +; R600-NEXT: [[EXTRACT:%.*]] = extractvalue { i8, i1 } [[TMP18]], 0 +; R600-NEXT: ret i8 [[EXTRACT]] ; %gep = getelementptr i8, ptr addrspace(3) %out, i64 4 %res = cmpxchg ptr addrspace(3) %gep, i8 %old, i8 %in seq_cst seq_cst, align 2 @@ -497,6 +868,35 @@ ; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_inc_i8_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] +; R600-NEXT: 
[[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res @@ -532,6 +932,35 @@ ; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_inc_i8_global_agent_align2( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 2 ret i8 %res @@ -557,6 +986,26 @@ ; CHECK: atomicrmw.end: ; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED1]] +; +; R600-LABEL: @test_atomicrmw_inc_i8_global_agent_align4( +; R600-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: 
[[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8 +; R600-NEXT: [[TMP2:%.*]] = add i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP3:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i8 0, i8 [[TMP2]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256 +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] +; R600-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED1]] ; %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 4 ret i8 %res @@ -564,24 +1013,23 @@ define i8 @test_atomicrmw_inc_i8_local(ptr addrspace(3) %ptr, i8 %value) { ; CHECK-LABEL: @test_atomicrmw_inc_i8_local( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] ; CHECK: atomicrmw.start: ; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] ; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] ; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 @@ -589,9 +1037,38 @@ ; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: 
@test_atomicrmw_inc_i8_local( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i8 %value seq_cst ret i8 %res @@ -599,24 +1076,23 @@ define i8 @test_atomicrmw_inc_i8_local_align2(ptr addrspace(3) %ptr, i8 %value) { ; CHECK-LABEL: @test_atomicrmw_inc_i8_local_align2( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] ; CHECK: atomicrmw.start: ; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] ; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] ; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; 
CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 @@ -624,9 +1100,38 @@ ; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_inc_i8_local_align2( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i8 %value seq_cst, align 2 ret i8 %res @@ -652,6 +1157,26 @@ ; CHECK: atomicrmw.end: ; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED1]] +; +; R600-LABEL: @test_atomicrmw_inc_i8_local_align4( +; R600-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8 +; R600-NEXT: [[TMP2:%.*]] = add i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP3:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i8 0, i8 [[TMP2]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: 
[[UNMASKED:%.*]] = and i32 [[LOADED]], -256 +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] +; R600-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED1]] ; %res = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i8 %value seq_cst, align 4 ret i8 %res @@ -687,6 +1212,35 @@ ; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_inc_i8_flat_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw uinc_wrap ptr %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res @@ -722,6 +1276,35 @@ ; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_inc_i8_flat_agent_align2( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ 
[[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw uinc_wrap ptr %ptr, i8 %value syncscope("agent") seq_cst, align 2 ret i8 %res @@ -747,6 +1330,26 @@ ; CHECK: atomicrmw.end: ; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED1]] +; +; R600-LABEL: @test_atomicrmw_inc_i8_flat_agent_align4( +; R600-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8 +; R600-NEXT: [[TMP2:%.*]] = add i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP3:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i8 0, i8 [[TMP2]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256 +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] +; R600-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED1]] ; %res = atomicrmw uinc_wrap ptr %ptr, i8 %value syncscope("agent") seq_cst, align 4 ret i8 %res @@ -784,6 +1387,37 @@ ; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_dec_i8_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ 
[[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0 +; R600-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res @@ -821,6 +1455,37 @@ ; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_dec_i8_global_agent_align2( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0 +; R600-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; 
R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 2 ret i8 %res @@ -848,6 +1513,28 @@ ; CHECK: atomicrmw.end: ; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED1]] +; +; R600-LABEL: @test_atomicrmw_dec_i8_global_agent_align4( +; R600-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8 +; R600-NEXT: [[TMP2:%.*]] = sub i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP3:%.*]] = icmp eq i8 [[EXTRACTED]], 0 +; R600-NEXT: [[TMP4:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[VALUE]], i8 [[TMP2]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256 +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] +; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED1]] ; %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 4 ret i8 %res @@ -855,18 +1542,17 @@ define i8 @test_atomicrmw_dec_i8_local(ptr addrspace(3) %ptr, i8 %value) { ; CHECK-LABEL: @test_atomicrmw_dec_i8_local( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] ; CHECK: atomicrmw.start: ; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0 @@ -874,7 +1560,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]] ; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], 
[[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 @@ -882,9 +1568,40 @@ ; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 ; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_dec_i8_local( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0 +; R600-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw udec_wrap ptr addrspace(3) %ptr, i8 %value seq_cst ret i8 %res @@ -892,18 +1609,17 @@ define i8 @test_atomicrmw_dec_i8_local_align2(ptr addrspace(3) %ptr, i8 %value) { ; CHECK-LABEL: @test_atomicrmw_dec_i8_local_align2( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to 
i32
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
 ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; CHECK: atomicrmw.start:
 ; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
 ; CHECK-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0
@@ -911,7 +1627,7 @@
 ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]]
 ; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
-; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -919,9 +1635,40 @@
 ; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
 ; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK: atomicrmw.end:
-; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
 ; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_dec_i8_local_align2(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1
+; R600-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0
+; R600-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]]
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
 ;
 %res = atomicrmw udec_wrap ptr addrspace(3) %ptr, i8 %value seq_cst, align 2
 ret i8 %res
@@ -949,6 +1696,28 @@
 ; CHECK: atomicrmw.end:
 ; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
 ; CHECK-NEXT: ret i8 [[EXTRACTED1]]
+;
+; R600-LABEL: @test_atomicrmw_dec_i8_local_align4(
+; R600-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
+; R600-NEXT: [[TMP2:%.*]] = sub i8 [[EXTRACTED]], 1
+; R600-NEXT: [[TMP3:%.*]] = icmp eq i8 [[EXTRACTED]], 0
+; R600-NEXT: [[TMP4:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[VALUE]], i8 [[TMP2]]
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED1]]
 ;
 %res = atomicrmw udec_wrap ptr addrspace(3) %ptr, i8 %value seq_cst, align 4
 ret i8 %res
@@ -986,6 +1755,37 @@
 ; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
 ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
 ; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_dec_i8_flat_agent(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1
+; R600-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0
+; R600-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]]
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
 ;
 %res = atomicrmw udec_wrap ptr %ptr, i8 %value syncscope("agent") seq_cst
 ret i8 %res
@@ -1023,6 +1823,37 @@
 ; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
 ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
 ; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_dec_i8_flat_agent_align2(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1
+; R600-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0
+; R600-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]]
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
 ;
 %res = atomicrmw udec_wrap ptr %ptr, i8 %value syncscope("agent") seq_cst, align 2
 ret i8 %res
@@ -1050,6 +1881,28 @@
 ; CHECK: atomicrmw.end:
 ; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
 ; CHECK-NEXT: ret i8 [[EXTRACTED1]]
+;
+; R600-LABEL: @test_atomicrmw_dec_i8_flat_agent_align4(
+; R600-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
+; R600-NEXT: [[TMP2:%.*]] = sub i8 [[EXTRACTED]], 1
+; R600-NEXT: [[TMP3:%.*]] = icmp eq i8 [[EXTRACTED]], 0
+; R600-NEXT: [[TMP4:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[VALUE]], i8 [[TMP2]]
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED1]]
 ;
 %res = atomicrmw udec_wrap ptr %ptr, i8 %value syncscope("agent") seq_cst, align 4
 ret i8 %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
@@ -1279,24 +1279,23 @@
 define half @test_atomicrmw_fadd_f16_local(ptr addrspace(3) %ptr, half %value) {
 ; CI-LABEL: @test_atomicrmw_fadd_f16_local(
-; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; CI-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CI-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CI-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CI-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CI-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; CI-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; CI-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; CI: atomicrmw.start:
 ; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
 ; CI-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
 ; CI-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16
 ; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; CI-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -1304,30 +1303,29 @@
 ; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CI: atomicrmw.end:
-; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; CI-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; CI-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
 ; CI-NEXT: ret half [[TMP7]]
 ;
 ; GFX9-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX9-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX9-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX9-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX9-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX9-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX9-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX9-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX9: atomicrmw.start:
 ; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
 ; GFX9-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
 ; GFX9-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16
 ; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX9-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -1335,30 +1333,29 @@
 ; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX9: atomicrmw.end:
-; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX9-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX9-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
 ; GFX9-NEXT: ret half [[TMP7]]
 ;
 ; GFX908-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX908-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX908-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX908-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX908-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX908-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX908-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX908-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX908: atomicrmw.start:
 ; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
 ; GFX908-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
 ; GFX908-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16
 ; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -1366,30 +1363,29 @@
 ; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX908: atomicrmw.end:
-; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX908-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX908-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
 ; GFX908-NEXT: ret half [[TMP7]]
 ;
 ; GFX90A-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX90A-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX90A-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX90A-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX90A-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX90A-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX90A-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX90A-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX90A: atomicrmw.start:
 ; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
 ; GFX90A-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
 ; GFX90A-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16
 ; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -1397,30 +1393,29 @@
 ; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX90A: atomicrmw.end:
-; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX90A-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX90A-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
 ; GFX90A-NEXT: ret half [[TMP7]]
 ;
 ; GFX940-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX940-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX940-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX940-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX940-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX940-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX940-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX940-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX940: atomicrmw.start:
 ; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
 ; GFX940-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
 ; GFX940-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16
 ; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX940-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -1428,30 +1423,29 @@
 ; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX940: atomicrmw.end:
-; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX940-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX940-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
 ; GFX940-NEXT: ret half [[TMP7]]
 ;
 ; GFX11-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX11-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX11-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX11-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX11-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX11-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX11-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX11-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX11: atomicrmw.start:
 ; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
 ; GFX11-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
 ; GFX11-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16
 ; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -1459,7 +1453,7 @@
 ; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX11: atomicrmw.end:
-; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX11-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX11-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
 ; GFX11-NEXT: ret half [[TMP7]]
@@ -2074,24 +2068,23 @@
 define bfloat @test_atomicrmw_fadd_bf16_local(ptr addrspace(3) %ptr, bfloat %value) {
 ; CI-LABEL: @test_atomicrmw_fadd_bf16_local(
-; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; CI-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CI-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CI-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CI-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CI-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; CI-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; CI-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; CI: atomicrmw.start:
 ; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; CI-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; CI-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; CI-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -2099,30 +2092,29 @@
 ; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CI: atomicrmw.end:
-; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; CI-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; CI-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; CI-NEXT: ret bfloat [[TMP7]]
 ;
 ; GFX9-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX9-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX9-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX9-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX9-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX9-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX9-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX9-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX9: atomicrmw.start:
 ; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX9-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX9-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX9-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -2130,30 +2122,29 @@
 ; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX9: atomicrmw.end:
-; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX9-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX9-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX9-NEXT: ret bfloat [[TMP7]]
 ;
 ; GFX908-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX908-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX908-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX908-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX908-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX908-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX908-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX908-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX908: atomicrmw.start:
 ; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX908-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX908-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -2161,30 +2152,29 @@
 ; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX908: atomicrmw.end:
-; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX908-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX908-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX908-NEXT: ret bfloat [[TMP7]]
 ;
 ; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX90A-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX90A-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX90A-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX90A-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX90A-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX90A-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX90A-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX90A: atomicrmw.start:
 ; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX90A-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX90A-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -2192,30 +2182,29 @@
 ; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX90A: atomicrmw.end:
-; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX90A-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX90A-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX90A-NEXT: ret bfloat [[TMP7]]
 ;
 ; GFX940-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX940-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX940-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX940-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX940-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX940-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX940-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX940-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX940: atomicrmw.start:
 ; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX940-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX940-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX940-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -2223,30 +2212,29 @@
 ; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX940: atomicrmw.end:
-; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX940-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX940-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX940-NEXT: ret bfloat [[TMP7]]
 ;
 ; GFX11-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX11-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX11-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX11-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX11-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX11-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX11-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX11-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX11: atomicrmw.start:
 ; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX11-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX11-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -2254,7 +2242,7 @@
 ; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX11: atomicrmw.end:
-; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX11-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX11-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX11-NEXT: ret bfloat [[TMP7]]
@@ -3040,24 +3028,23 @@
 define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bfloat %value) #2 {
 ; CI-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; CI-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CI-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CI-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CI-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CI-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; CI-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; CI-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; CI: atomicrmw.start:
 ; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; CI-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; CI-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; CI-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -3065,30 +3052,29 @@
 ; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CI: atomicrmw.end:
-; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; CI-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; CI-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; CI-NEXT: ret bfloat [[TMP7]]
 ;
 ; GFX9-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX9-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX9-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX9-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX9-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX9-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX9-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX9-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX9: atomicrmw.start:
 ; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX9-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX9-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX9-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -3096,30 +3082,29 @@
 ; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX9: atomicrmw.end:
-; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX9-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX9-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX9-NEXT: ret bfloat [[TMP7]]
 ;
 ; GFX908-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX908-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX908-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX908-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX908-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX908-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX908-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX908-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX908: atomicrmw.start:
 ; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX908-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX908-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -3127,30 +3112,29 @@
 ; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX908: atomicrmw.end:
-; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX908-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX908-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX908-NEXT: ret bfloat [[TMP7]]
 ;
 ; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX90A-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX90A-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX90A-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX90A-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX90A-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX90A-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX90A-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX90A: atomicrmw.start:
 ; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX90A-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX90A-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -3158,30 +3142,29 @@
 ; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX90A: atomicrmw.end:
-; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX90A-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX90A-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX90A-NEXT: ret bfloat [[TMP7]]
 ;
 ; GFX940-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX940-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX940-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX940-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX940-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX940-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX940-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX940-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX940: atomicrmw.start:
 ; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX940-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX940-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX940-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -3189,30 +3172,29 @@
 ; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX940: atomicrmw.end:
-; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX940-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX940-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX940-NEXT: ret bfloat [[TMP7]]
 ;
 ; GFX11-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX11-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX11-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX11-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX11-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX11-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX11-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX11-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX11: atomicrmw.start:
 ; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX11-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX11-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -3220,7 +3202,7 @@
 ; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX11: atomicrmw.end:
-; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX11-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX11-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX11-NEXT: ret bfloat [[TMP7]]
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
@@ -165,24 +165,23 @@
 define half @test_atomicrmw_fmax_f16_local(ptr addrspace(3) %ptr, half %value) {
 ; GCN-LABEL: @test_atomicrmw_fmax_f16_local(
-; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GCN-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GCN: atomicrmw.start:
 ; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
 ; GCN-NEXT: [[TMP5:%.*]] = call half @llvm.maxnum.f16(half [[TMP4]], half [[VALUE:%.*]])
 ; GCN-NEXT: [[TMP6:%.*]] = bitcast half [[TMP5]] to i16
 ; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP6]] to i32
-; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GCN-NEXT: [[TMP7:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -190,7 +189,7 @@
 ; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0
 ; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GCN: atomicrmw.end:
-; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GCN-NEXT: [[TMP8:%.*]] = bitcast i16 [[EXTRACTED3]] to half
 ; GCN-NEXT: ret half [[TMP8]]
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
@@ -165,24 +165,23 @@
 define half @test_atomicrmw_fmin_f16_local(ptr addrspace(3) %ptr, half %value) {
 ; GCN-LABEL: @test_atomicrmw_fmin_f16_local(
-; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GCN-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GCN: atomicrmw.start:
 ; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
 ; GCN-NEXT: [[TMP5:%.*]] = call half @llvm.minnum.f16(half [[TMP4]], half [[VALUE:%.*]])
 ; GCN-NEXT: [[TMP6:%.*]] = bitcast half [[TMP5]] to i16
 ; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP6]] to i32
-; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GCN-NEXT: [[TMP7:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -190,7 +189,7 @@
 ; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0
 ; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GCN: atomicrmw.end:
-; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GCN-NEXT: [[TMP8:%.*]] = bitcast i16 [[EXTRACTED3]] to half
 ; GCN-NEXT: ret half [[TMP8]]
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll
@@ -165,24 +165,23 @@
 define half @test_atomicrmw_fsub_f16_local(ptr addrspace(3) %ptr, half %value) {
 ; GCN-LABEL: @test_atomicrmw_fsub_f16_local(
-; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GCN-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GCN: atomicrmw.start:
 ; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
 ; GCN-NEXT: [[NEW:%.*]] = fsub half [[TMP4]], [[VALUE:%.*]]
 ; GCN-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16
 ; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GCN-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -190,7 +189,7 @@
 ; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GCN: atomicrmw.end:
-; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GCN-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
 ; GCN-NEXT: ret half [[TMP7]]
@@ -285,24 +284,23 @@
 define bfloat @test_atomicrmw_fadd_bf16_local(ptr addrspace(3) %ptr, bfloat %value) {
 ; GCN-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GCN-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GCN: atomicrmw.start:
 ; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GCN-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GCN-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GCN-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -310,7 +308,7 @@
 ; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GCN: atomicrmw.end:
-; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GCN-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GCN-NEXT: ret bfloat [[TMP7]]
@@ -471,24 +469,23 @@
 define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bfloat %value) #2 {
 ; GCN-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GCN-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GCN: atomicrmw.start:
 ; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GCN-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GCN-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GCN-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -496,7 +493,7 @@
 ; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GCN: atomicrmw.end:
-; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GCN-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GCN-NEXT: ret bfloat [[TMP7]]
diff --git a/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll b/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll
--- a/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll
+++ b/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll
@@ -33,16 +33,12 @@
 ; PWR7-NEXT: [[TMP0:%.*]] = alloca i128, align 8
 ; PWR7-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP0]])
 ; PWR7-NEXT: store i128 [[DESIRE:%.*]], ptr [[TMP0]], align 8
-; PWR7-NEXT: [[TMP1:%.*]] = alloca i128, align 8
-; PWR7-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP1]])
-; PWR7-NEXT: store i128 [[NEW:%.*]], ptr [[TMP1]], align 8
-; PWR7-NEXT: [[TMP2:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 16, ptr [[ADDR:%.*]], ptr [[TMP0]], ptr [[TMP1]], i32 5, i32 5)
-; PWR7-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP1]])
-; PWR7-NEXT: [[TMP3:%.*]] = load i128, ptr [[TMP0]], align 8
+; PWR7-NEXT: [[TMP1:%.*]] = call zeroext i1 @__atomic_compare_exchange_16(ptr [[ADDR:%.*]], ptr [[TMP0]], i128 [[NEW:%.*]], i32 5, i32 5)
+; PWR7-NEXT: [[TMP2:%.*]] = load
i128, ptr [[TMP0]], align 8 ; PWR7-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP0]]) -; PWR7-NEXT: [[TMP4:%.*]] = insertvalue { i128, i1 } poison, i128 [[TMP3]], 0 -; PWR7-NEXT: [[TMP5:%.*]] = insertvalue { i128, i1 } [[TMP4]], i1 [[TMP2]], 1 -; PWR7-NEXT: [[SUCC:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1 +; PWR7-NEXT: [[TMP3:%.*]] = insertvalue { i128, i1 } poison, i128 [[TMP2]], 0 +; PWR7-NEXT: [[TMP4:%.*]] = insertvalue { i128, i1 } [[TMP3]], i1 [[TMP1]], 1 +; PWR7-NEXT: [[SUCC:%.*]] = extractvalue { i128, i1 } [[TMP4]], 1 ; PWR7-NEXT: ret i1 [[SUCC]] ; entry: diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll --- a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll +++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll @@ -6,7 +6,7 @@ ; CHECK-LABEL: @atomic_load256_libcall( ; CHECK-NEXT: [[TMP1:%.*]] = alloca i256, align 8 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[TMP1]]) -; CHECK-NEXT: call void @__atomic_load(i64 32, ptr [[PTR:%.*]], ptr [[TMP1]], i32 0) +; CHECK-NEXT: call void @__atomic_load(i32 32, ptr [[PTR:%.*]], ptr [[TMP1]], i32 0) ; CHECK-NEXT: [[TMP2:%.*]] = load i256, ptr [[TMP1]], align 8 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[TMP1]]) ; CHECK-NEXT: ret i256 [[TMP2]] @@ -20,7 +20,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[PTR:%.*]] to ptr ; CHECK-NEXT: [[TMP2:%.*]] = alloca i256, align 8 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[TMP2]]) -; CHECK-NEXT: call void @__atomic_load(i64 32, ptr [[TMP1]], ptr [[TMP2]], i32 0) +; CHECK-NEXT: call void @__atomic_load(i32 32, ptr [[TMP1]], ptr [[TMP2]], i32 0) ; CHECK-NEXT: [[TMP3:%.*]] = load i256, ptr [[TMP2]], align 8 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[TMP2]]) ; CHECK-NEXT: ret i256 [[TMP3]] diff --git a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll --- a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll @@ -7,10 +7,10 @@ ; CHECK-NEXT: [[PTR_CAST:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr ; CHECK-NEXT: br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr addrspace(3) [[PTR]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr addrspace(3) [[PTR]], align 8 ; CHECK-NEXT: ret i64 [[V1]] ; CHECK: l2: -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[PTR_CAST]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[PTR_CAST]], align 8 ; CHECK-NEXT: ret i64 [[V2]] ; %ptr_cast = addrspacecast ptr addrspace(3) %ptr to ptr @@ -31,10 +31,10 @@ ; CHECK-NEXT: [[PTR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr ; CHECK-NEXT: br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr addrspace(5) [[PTR]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr addrspace(5) [[PTR]], align 8 ; CHECK-NEXT: ret i64 [[V1]] ; CHECK: l2: -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[PTR_CAST]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[PTR_CAST]], align 8 ; CHECK-NEXT: ret i64 [[V2]] ; %ptr_cast = addrspacecast ptr addrspace(5) %ptr to ptr @@ -55,11 +55,11 @@ ; CHECK-SAME: i1 [[PRED:%.*]], ptr addrspace(1) [[PTR:%.*]]) { ; CHECK-NEXT: br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[V1:%.*]] = load i64, 
ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 ; CHECK-NEXT: ret i64 [[V1]] ; CHECK: l2: ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[TMP1]], align 8 ; CHECK-NEXT: ret i64 [[V2]] ; %ptr_cast = addrspacecast ptr addrspace(1) %ptr to ptr @@ -79,11 +79,11 @@ ; CHECK-SAME: i1 [[PRED:%.*]], ptr [[PTR:%.*]]) { ; CHECK-NEXT: br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[PTR]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[PTR]], align 8 ; CHECK-NEXT: ret i64 [[V1]] ; CHECK: l2: ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr addrspace(1) [[TMP1]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr addrspace(1) [[TMP1]], align 8 ; CHECK-NEXT: ret i64 [[V2]] ; %ptr_cast = addrspacecast ptr %ptr to ptr addrspace(1) @@ -103,11 +103,11 @@ ; CHECK-SAME: i1 [[PRED:%.*]], ptr [[PTR:%.*]]) { ; CHECK-NEXT: br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[PTR]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[PTR]], align 8 ; CHECK-NEXT: ret i64 [[V1]] ; CHECK: l2: ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(4) -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr addrspace(4) [[TMP1]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr addrspace(4) [[TMP1]], align 8 ; CHECK-NEXT: ret i64 [[V2]] ; %ptr_cast = addrspacecast ptr %ptr to ptr addrspace(4) @@ -125,13 +125,13 @@ define i64 @sink_flat_to_local(i1 %pred, ptr %ptr) { ; CHECK-LABEL: define i64 @sink_flat_to_local( ; CHECK-SAME: i1 [[PRED:%.*]], ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[PTR_CAST:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) ; CHECK-NEXT: br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[PTR]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[PTR]], align 8 ; CHECK-NEXT: ret i64 [[V1]] ; CHECK: l2: -; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr addrspace(3) [[TMP1]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr addrspace(3) [[PTR_CAST]], align 8 ; CHECK-NEXT: ret i64 [[V2]] ; %ptr_cast = addrspacecast ptr %ptr to ptr addrspace(3) @@ -149,13 +149,13 @@ define i64 @sink_flat_to_private(i1 %pred, ptr %ptr) { ; CHECK-LABEL: define i64 @sink_flat_to_private( ; CHECK-SAME: i1 [[PRED:%.*]], ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[PTR_CAST:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) ; CHECK-NEXT: br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[PTR]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[PTR]], align 8 ; CHECK-NEXT: ret i64 [[V1]] ; CHECK: l2: -; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr addrspace(5) [[PTR_CAST]], align 8 ; CHECK-NEXT: ret i64 [[V2]] ; %ptr_cast = addrspacecast ptr %ptr to ptr addrspace(5) diff --git a/llvm/test/Transforms/DivRemPairs/X86/div-expanded-rem-pair.ll b/llvm/test/Transforms/DivRemPairs/X86/div-expanded-rem-pair.ll --- a/llvm/test/Transforms/DivRemPairs/X86/div-expanded-rem-pair.ll +++ b/llvm/test/Transforms/DivRemPairs/X86/div-expanded-rem-pair.ll @@ -214,7 
+214,7 @@ ; CHECK-NEXT: [[REM:%.*]] = urem i64 [[A]], [[B]] ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[END:%.*]] ; CHECK: if.then: -; CHECK-NEXT: store i64 [[REM]], ptr [[RP]], align 4 +; CHECK-NEXT: store i64 [[REM]], ptr [[RP]], align 8 ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: ret i64 [[DIV]] @@ -246,7 +246,7 @@ ; CHECK: if.then: ; CHECK-NEXT: [[TMP0:%.*]] = mul i128 [[DIV]], [[B_FROZEN]] ; CHECK-NEXT: [[REM_DECOMPOSED:%.*]] = sub i128 [[A_FROZEN]], [[TMP0]] -; CHECK-NEXT: store i128 [[REM_DECOMPOSED]], ptr [[RP]], align 4 +; CHECK-NEXT: store i128 [[REM_DECOMPOSED]], ptr [[RP]], align 8 ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: ret i128 [[DIV]] @@ -275,7 +275,7 @@ ; CHECK-NEXT: i64 2, label [[SW_BB]] ; CHECK-NEXT: ] ; CHECK: sw.bb: -; CHECK-NEXT: store i64 [[REM]], ptr [[RP:%.*]], align 4 +; CHECK-NEXT: store i64 [[REM]], ptr [[RP:%.*]], align 8 ; CHECK-NEXT: br label [[SW_DEFAULT]] ; CHECK: sw.default: ; CHECK-NEXT: ret i64 [[DIV]] @@ -306,7 +306,7 @@ ; CHECK-NEXT: i64 2, label [[SW_BB]] ; CHECK-NEXT: ] ; CHECK: sw.default: -; CHECK-NEXT: store i64 [[REM]], ptr [[RP:%.*]], align 4 +; CHECK-NEXT: store i64 [[REM]], ptr [[RP:%.*]], align 8 ; CHECK-NEXT: br label [[SW_BB]] ; CHECK: sw.bb: ; CHECK-NEXT: ret i64 [[DIV]] @@ -339,7 +339,7 @@ ; CHECK: if.then: ; CHECK-NEXT: call void @maythrow() ; CHECK-NEXT: [[REM:%.*]] = urem i64 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: store i64 [[REM]], ptr [[RP]], align 4 +; CHECK-NEXT: store i64 [[REM]], ptr [[RP]], align 8 ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: [[DIV:%.*]] = udiv i64 [[A]], [[B]] @@ -369,7 +369,7 @@ ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[END:%.*]] ; CHECK: if.then: ; CHECK-NEXT: [[REM:%.*]] = urem i64 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: store i64 [[REM]], ptr [[RP]], align 4 +; CHECK-NEXT: store i64 [[REM]], ptr [[RP]], align 8 ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: call void @maythrow() @@ -401,7 +401,7 @@ ; CHECK: if.then: ; CHECK-NEXT: call void @maythrow() ; CHECK-NEXT: [[REM:%.*]] = urem i128 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: store i128 [[REM]], ptr [[RP]], align 4 +; CHECK-NEXT: store i128 [[REM]], ptr [[RP]], align 8 ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: [[DIV:%.*]] = udiv i128 [[A]], [[B]] @@ -431,7 +431,7 @@ ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[END:%.*]] ; CHECK: if.then: ; CHECK-NEXT: [[REM:%.*]] = urem i128 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: store i128 [[REM]], ptr [[RP]], align 4 +; CHECK-NEXT: store i128 [[REM]], ptr [[RP]], align 8 ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: call void @maythrow() @@ -464,7 +464,7 @@ ; CHECK-NEXT: ] ; CHECK: sw.bb: ; CHECK-NEXT: [[REM:%.*]] = urem i64 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: store i64 [[REM]], ptr [[RP:%.*]], align 4 +; CHECK-NEXT: store i64 [[REM]], ptr [[RP:%.*]], align 8 ; CHECK-NEXT: br label [[SW_BB1]] ; CHECK: sw.bb1: ; CHECK-NEXT: [[DIV:%.*]] = udiv i64 [[A]], [[B]] @@ -501,7 +501,7 @@ ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END3:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: ; CHECK-NEXT: [[REM:%.*]] = urem i64 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: store i64 [[REM]], ptr [[RP]], align 4 +; CHECK-NEXT: store i64 [[REM]], ptr [[RP]], align 8 ; CHECK-NEXT: [[TOBOOL1_NOT:%.*]] = icmp eq i64 [[C:%.*]], 0 ; CHECK-NEXT: br i1 [[TOBOOL1_NOT]], label [[IF_END3]], label [[RETURN:%.*]] ; CHECK: if.end3: diff --git a/llvm/test/Transforms/IndVarSimplify/ARM/indvar-cost.ll 
b/llvm/test/Transforms/IndVarSimplify/ARM/indvar-cost.ll --- a/llvm/test/Transforms/IndVarSimplify/ARM/indvar-cost.ll +++ b/llvm/test/Transforms/IndVarSimplify/ARM/indvar-cost.ll @@ -13,6 +13,9 @@ ; CHECK-T1-NEXT: [[CMP41080:%.*]] = icmp eq i32 [[SUB]], 0 ; CHECK-T1-NEXT: br i1 [[CMP41080]], label [[WHILE_END13:%.*]], label [[WHILE_COND5_PREHEADER_PREHEADER:%.*]] ; CHECK-T1: while.cond5.preheader.preheader: +; CHECK-T1-NEXT: [[TMP0:%.*]] = add i32 [[SRCALEN_SRCBLEN]], -2 +; CHECK-T1-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 2) +; CHECK-T1-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[UMIN]], 2 ; CHECK-T1-NEXT: br label [[WHILE_COND5_PREHEADER:%.*]] ; CHECK-T1: while.cond5.preheader: ; CHECK-T1-NEXT: [[COUNT_01084:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_END:%.*]] ], [ 1, [[WHILE_COND5_PREHEADER_PREHEADER]] ] @@ -26,11 +29,11 @@ ; CHECK-T1-NEXT: [[PY_11076:%.*]] = phi ptr [ [[INCDEC_PTR8:%.*]], [[WHILE_BODY7]] ], [ [[PY_01082]], [[WHILE_COND5_PREHEADER]] ] ; CHECK-T1-NEXT: [[PX_11075:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY7]] ], [ [[PSRCB_PSRCA]], [[WHILE_COND5_PREHEADER]] ] ; CHECK-T1-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, ptr [[PX_11075]], i32 1 -; CHECK-T1-NEXT: [[TMP0:%.*]] = load i16, ptr [[PX_11075]], align 2 -; CHECK-T1-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 +; CHECK-T1-NEXT: [[TMP2:%.*]] = load i16, ptr [[PX_11075]], align 2 +; CHECK-T1-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32 ; CHECK-T1-NEXT: [[INCDEC_PTR8]] = getelementptr inbounds i16, ptr [[PY_11076]], i32 -1 -; CHECK-T1-NEXT: [[TMP1:%.*]] = load i16, ptr [[PY_11076]], align 2 -; CHECK-T1-NEXT: [[CONV9:%.*]] = sext i16 [[TMP1]] to i32 +; CHECK-T1-NEXT: [[TMP3:%.*]] = load i16, ptr [[PY_11076]], align 2 +; CHECK-T1-NEXT: [[CONV9:%.*]] = sext i16 [[TMP3]] to i32 ; CHECK-T1-NEXT: [[MUL_I:%.*]] = mul nsw i32 [[CONV9]], [[CONV]] ; CHECK-T1-NEXT: [[SHR3_I:%.*]] = ashr i32 [[CONV]], 16 ; CHECK-T1-NEXT: [[SHR4_I:%.*]] = ashr i32 [[CONV9]], 16 @@ -42,17 +45,15 @@ ; CHECK-T1-NEXT: br i1 [[CMP6]], label [[WHILE_END]], label [[WHILE_BODY7]] ; CHECK-T1: while.end: ; CHECK-T1-NEXT: [[ADD6_I_LCSSA:%.*]] = phi i32 [ [[ADD6_I]], [[WHILE_BODY7]] ] -; CHECK-T1-NEXT: [[TMP2:%.*]] = lshr i32 [[ADD6_I_LCSSA]], 15 -; CHECK-T1-NEXT: [[CONV10:%.*]] = trunc i32 [[TMP2]] to i16 +; CHECK-T1-NEXT: [[TMP4:%.*]] = lshr i32 [[ADD6_I_LCSSA]], 15 +; CHECK-T1-NEXT: [[CONV10:%.*]] = trunc i32 [[TMP4]] to i16 ; CHECK-T1-NEXT: [[INCDEC_PTR11]] = getelementptr inbounds i16, ptr [[POUT_01081]], i32 1 ; CHECK-T1-NEXT: store i16 [[CONV10]], ptr [[POUT_01081]], align 2 ; CHECK-T1-NEXT: [[ADD_PTR]] = getelementptr inbounds i16, ptr [[PSRCA_PSRCB]], i32 [[COUNT_01084]] ; CHECK-T1-NEXT: [[INC]] = add nuw nsw i32 [[COUNT_01084]], 1 ; CHECK-T1-NEXT: [[DEC12]] = add i32 [[BLOCKSIZE1_01083]], -1 -; CHECK-T1-NEXT: [[CMP3:%.*]] = icmp ult i32 [[COUNT_01084]], 3 -; CHECK-T1-NEXT: [[CMP4:%.*]] = icmp ne i32 [[DEC12]], 0 -; CHECK-T1-NEXT: [[TMP3:%.*]] = and i1 [[CMP4]], [[CMP3]] -; CHECK-T1-NEXT: br i1 [[TMP3]], label [[WHILE_COND5_PREHEADER]], label [[WHILE_END13_LOOPEXIT:%.*]] +; CHECK-T1-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], [[TMP1]] +; CHECK-T1-NEXT: br i1 [[EXITCOND]], label [[WHILE_COND5_PREHEADER]], label [[WHILE_END13_LOOPEXIT:%.*]] ; CHECK-T1: while.end13.loopexit: ; CHECK-T1-NEXT: [[INCDEC_PTR11_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR11]], [[WHILE_END]] ] ; CHECK-T1-NEXT: [[ADD_PTR_LCSSA:%.*]] = phi ptr [ [[ADD_PTR]], [[WHILE_END]] ] @@ -85,34 +86,34 @@ ; CHECK-T1-NEXT: [[PY_31056:%.*]] = phi ptr [ [[ADD_PTR_I884:%.*]], 
[[WHILE_BODY23]] ], [ [[PY_21070]], [[WHILE_BODY23_PREHEADER]] ] ; CHECK-T1-NEXT: [[PX_31055:%.*]] = phi ptr [ [[ADD_PTR_I890:%.*]], [[WHILE_BODY23]] ], [ [[PSRCB_PSRCA]], [[WHILE_BODY23_PREHEADER]] ] ; CHECK-T1-NEXT: [[ARRAYIDX_I907:%.*]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 1 -; CHECK-T1-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I907]], align 2 -; CHECK-T1-NEXT: [[TMP5:%.*]] = load i16, ptr [[PX_31055]], align 2 +; CHECK-T1-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_I907]], align 2 +; CHECK-T1-NEXT: [[TMP6:%.*]] = load i16, ptr [[PX_31055]], align 2 ; CHECK-T1-NEXT: [[ADD_PTR_I912:%.*]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 2 ; CHECK-T1-NEXT: [[ARRAYIDX_I901:%.*]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 1 -; CHECK-T1-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I901]], align 2 -; CHECK-T1-NEXT: [[TMP7:%.*]] = load i16, ptr [[PY_31056]], align 2 +; CHECK-T1-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_I901]], align 2 +; CHECK-T1-NEXT: [[TMP8:%.*]] = load i16, ptr [[PY_31056]], align 2 ; CHECK-T1-NEXT: [[ADD_PTR_I906:%.*]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 -2 -; CHECK-T1-NEXT: [[SHR_I892:%.*]] = sext i16 [[TMP5]] to i32 -; CHECK-T1-NEXT: [[SHR1_I893:%.*]] = sext i16 [[TMP6]] to i32 +; CHECK-T1-NEXT: [[SHR_I892:%.*]] = sext i16 [[TMP6]] to i32 +; CHECK-T1-NEXT: [[SHR1_I893:%.*]] = sext i16 [[TMP7]] to i32 ; CHECK-T1-NEXT: [[MUL_I894:%.*]] = mul nsw i32 [[SHR1_I893]], [[SHR_I892]] -; CHECK-T1-NEXT: [[SHR2_I895:%.*]] = sext i16 [[TMP4]] to i32 -; CHECK-T1-NEXT: [[SHR4_I897:%.*]] = sext i16 [[TMP7]] to i32 +; CHECK-T1-NEXT: [[SHR2_I895:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-T1-NEXT: [[SHR4_I897:%.*]] = sext i16 [[TMP8]] to i32 ; CHECK-T1-NEXT: [[MUL5_I898:%.*]] = mul nsw i32 [[SHR4_I897]], [[SHR2_I895]] ; CHECK-T1-NEXT: [[ADD_I899:%.*]] = add i32 [[MUL_I894]], [[SUM_11057]] ; CHECK-T1-NEXT: [[ADD6_I900:%.*]] = add i32 [[ADD_I899]], [[MUL5_I898]] ; CHECK-T1-NEXT: [[ARRAYIDX_I885:%.*]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 3 -; CHECK-T1-NEXT: [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_I885]], align 2 -; CHECK-T1-NEXT: [[TMP9:%.*]] = load i16, ptr [[ADD_PTR_I912]], align 2 +; CHECK-T1-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I885]], align 2 +; CHECK-T1-NEXT: [[TMP10:%.*]] = load i16, ptr [[ADD_PTR_I912]], align 2 ; CHECK-T1-NEXT: [[ADD_PTR_I890]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 4 ; CHECK-T1-NEXT: [[ARRAYIDX_I879:%.*]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 -1 -; CHECK-T1-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I879]], align 2 -; CHECK-T1-NEXT: [[TMP11:%.*]] = load i16, ptr [[ADD_PTR_I906]], align 2 +; CHECK-T1-NEXT: [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_I879]], align 2 +; CHECK-T1-NEXT: [[TMP12:%.*]] = load i16, ptr [[ADD_PTR_I906]], align 2 ; CHECK-T1-NEXT: [[ADD_PTR_I884]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 -4 -; CHECK-T1-NEXT: [[SHR_I870:%.*]] = sext i16 [[TMP9]] to i32 -; CHECK-T1-NEXT: [[SHR1_I871:%.*]] = sext i16 [[TMP10]] to i32 +; CHECK-T1-NEXT: [[SHR_I870:%.*]] = sext i16 [[TMP10]] to i32 +; CHECK-T1-NEXT: [[SHR1_I871:%.*]] = sext i16 [[TMP11]] to i32 ; CHECK-T1-NEXT: [[MUL_I872:%.*]] = mul nsw i32 [[SHR1_I871]], [[SHR_I870]] -; CHECK-T1-NEXT: [[SHR2_I873:%.*]] = sext i16 [[TMP8]] to i32 -; CHECK-T1-NEXT: [[SHR4_I875:%.*]] = sext i16 [[TMP11]] to i32 +; CHECK-T1-NEXT: [[SHR2_I873:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-T1-NEXT: [[SHR4_I875:%.*]] = sext i16 [[TMP12]] to i32 ; CHECK-T1-NEXT: [[MUL5_I876:%.*]] = mul nsw i32 [[SHR4_I875]], 
[[SHR2_I873]] ; CHECK-T1-NEXT: [[ADD_I877:%.*]] = add i32 [[ADD6_I900]], [[MUL_I872]] ; CHECK-T1-NEXT: [[ADD6_I878]] = add i32 [[ADD_I877]], [[MUL5_I876]] @@ -140,11 +141,11 @@ ; CHECK-T1-NEXT: [[PY_41064:%.*]] = phi ptr [ [[INCDEC_PTR39:%.*]], [[WHILE_BODY36]] ], [ [[ADD_PTR32]], [[WHILE_BODY36_PREHEADER]] ] ; CHECK-T1-NEXT: [[PX_41063:%.*]] = phi ptr [ [[INCDEC_PTR37:%.*]], [[WHILE_BODY36]] ], [ [[PX_3_LCSSA]], [[WHILE_BODY36_PREHEADER]] ] ; CHECK-T1-NEXT: [[INCDEC_PTR37]] = getelementptr inbounds i16, ptr [[PX_41063]], i32 1 -; CHECK-T1-NEXT: [[TMP12:%.*]] = load i16, ptr [[PX_41063]], align 2 -; CHECK-T1-NEXT: [[CONV38:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-T1-NEXT: [[TMP13:%.*]] = load i16, ptr [[PX_41063]], align 2 +; CHECK-T1-NEXT: [[CONV38:%.*]] = sext i16 [[TMP13]] to i32 ; CHECK-T1-NEXT: [[INCDEC_PTR39]] = getelementptr inbounds i16, ptr [[PY_41064]], i32 -1 -; CHECK-T1-NEXT: [[TMP13:%.*]] = load i16, ptr [[PY_41064]], align 2 -; CHECK-T1-NEXT: [[CONV40:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-T1-NEXT: [[TMP14:%.*]] = load i16, ptr [[PY_41064]], align 2 +; CHECK-T1-NEXT: [[CONV40:%.*]] = sext i16 [[TMP14]] to i32 ; CHECK-T1-NEXT: [[MUL_I863:%.*]] = mul nsw i32 [[CONV40]], [[CONV38]] ; CHECK-T1-NEXT: [[SHR3_I864:%.*]] = ashr i32 [[CONV38]], 16 ; CHECK-T1-NEXT: [[SHR4_I865:%.*]] = ashr i32 [[CONV40]], 16 @@ -159,8 +160,8 @@ ; CHECK-T1-NEXT: br label [[WHILE_END43]] ; CHECK-T1: while.end43: ; CHECK-T1-NEXT: [[SUM_2_LCSSA:%.*]] = phi i32 [ [[SUM_1_LCSSA]], [[WHILE_END31]] ], [ [[ADD6_I868_LCSSA]], [[WHILE_END43_LOOPEXIT]] ] -; CHECK-T1-NEXT: [[TMP14:%.*]] = lshr i32 [[SUM_2_LCSSA]], 15 -; CHECK-T1-NEXT: [[CONV45:%.*]] = trunc i32 [[TMP14]] to i16 +; CHECK-T1-NEXT: [[TMP15:%.*]] = lshr i32 [[SUM_2_LCSSA]], 15 +; CHECK-T1-NEXT: [[CONV45:%.*]] = trunc i32 [[TMP15]] to i16 ; CHECK-T1-NEXT: [[INCDEC_PTR46]] = getelementptr inbounds i16, ptr [[POUT_11069]], i32 1 ; CHECK-T1-NEXT: store i16 [[CONV45]], ptr [[POUT_11069]], align 2 ; CHECK-T1-NEXT: [[SUB47:%.*]] = add i32 [[COUNT_11072]], -1 @@ -184,6 +185,9 @@ ; CHECK-T2-NEXT: [[CMP41080:%.*]] = icmp eq i32 [[SUB]], 0 ; CHECK-T2-NEXT: br i1 [[CMP41080]], label [[WHILE_END13:%.*]], label [[WHILE_COND5_PREHEADER_PREHEADER:%.*]] ; CHECK-T2: while.cond5.preheader.preheader: +; CHECK-T2-NEXT: [[TMP0:%.*]] = add i32 [[SRCALEN_SRCBLEN]], -2 +; CHECK-T2-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 2) +; CHECK-T2-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[UMIN]], 2 ; CHECK-T2-NEXT: br label [[WHILE_COND5_PREHEADER:%.*]] ; CHECK-T2: while.cond5.preheader: ; CHECK-T2-NEXT: [[COUNT_01084:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_END:%.*]] ], [ 1, [[WHILE_COND5_PREHEADER_PREHEADER]] ] @@ -197,11 +201,11 @@ ; CHECK-T2-NEXT: [[PY_11076:%.*]] = phi ptr [ [[INCDEC_PTR8:%.*]], [[WHILE_BODY7]] ], [ [[PY_01082]], [[WHILE_COND5_PREHEADER]] ] ; CHECK-T2-NEXT: [[PX_11075:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY7]] ], [ [[PSRCB_PSRCA]], [[WHILE_COND5_PREHEADER]] ] ; CHECK-T2-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, ptr [[PX_11075]], i32 1 -; CHECK-T2-NEXT: [[TMP0:%.*]] = load i16, ptr [[PX_11075]], align 2 -; CHECK-T2-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 +; CHECK-T2-NEXT: [[TMP2:%.*]] = load i16, ptr [[PX_11075]], align 2 +; CHECK-T2-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32 ; CHECK-T2-NEXT: [[INCDEC_PTR8]] = getelementptr inbounds i16, ptr [[PY_11076]], i32 -1 -; CHECK-T2-NEXT: [[TMP1:%.*]] = load i16, ptr [[PY_11076]], align 2 -; CHECK-T2-NEXT: [[CONV9:%.*]] = sext i16 [[TMP1]] to i32 +; CHECK-T2-NEXT: 
[[TMP3:%.*]] = load i16, ptr [[PY_11076]], align 2 +; CHECK-T2-NEXT: [[CONV9:%.*]] = sext i16 [[TMP3]] to i32 ; CHECK-T2-NEXT: [[MUL_I:%.*]] = mul nsw i32 [[CONV9]], [[CONV]] ; CHECK-T2-NEXT: [[SHR3_I:%.*]] = ashr i32 [[CONV]], 16 ; CHECK-T2-NEXT: [[SHR4_I:%.*]] = ashr i32 [[CONV9]], 16 @@ -213,17 +217,15 @@ ; CHECK-T2-NEXT: br i1 [[CMP6]], label [[WHILE_END]], label [[WHILE_BODY7]] ; CHECK-T2: while.end: ; CHECK-T2-NEXT: [[ADD6_I_LCSSA:%.*]] = phi i32 [ [[ADD6_I]], [[WHILE_BODY7]] ] -; CHECK-T2-NEXT: [[TMP2:%.*]] = lshr i32 [[ADD6_I_LCSSA]], 15 -; CHECK-T2-NEXT: [[CONV10:%.*]] = trunc i32 [[TMP2]] to i16 +; CHECK-T2-NEXT: [[TMP4:%.*]] = lshr i32 [[ADD6_I_LCSSA]], 15 +; CHECK-T2-NEXT: [[CONV10:%.*]] = trunc i32 [[TMP4]] to i16 ; CHECK-T2-NEXT: [[INCDEC_PTR11]] = getelementptr inbounds i16, ptr [[POUT_01081]], i32 1 ; CHECK-T2-NEXT: store i16 [[CONV10]], ptr [[POUT_01081]], align 2 ; CHECK-T2-NEXT: [[ADD_PTR]] = getelementptr inbounds i16, ptr [[PSRCA_PSRCB]], i32 [[COUNT_01084]] ; CHECK-T2-NEXT: [[INC]] = add nuw nsw i32 [[COUNT_01084]], 1 ; CHECK-T2-NEXT: [[DEC12]] = add i32 [[BLOCKSIZE1_01083]], -1 -; CHECK-T2-NEXT: [[CMP3:%.*]] = icmp ult i32 [[COUNT_01084]], 3 -; CHECK-T2-NEXT: [[CMP4:%.*]] = icmp ne i32 [[DEC12]], 0 -; CHECK-T2-NEXT: [[TMP3:%.*]] = and i1 [[CMP4]], [[CMP3]] -; CHECK-T2-NEXT: br i1 [[TMP3]], label [[WHILE_COND5_PREHEADER]], label [[WHILE_END13_LOOPEXIT:%.*]] +; CHECK-T2-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], [[TMP1]] +; CHECK-T2-NEXT: br i1 [[EXITCOND]], label [[WHILE_COND5_PREHEADER]], label [[WHILE_END13_LOOPEXIT:%.*]] ; CHECK-T2: while.end13.loopexit: ; CHECK-T2-NEXT: [[INCDEC_PTR11_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR11]], [[WHILE_END]] ] ; CHECK-T2-NEXT: [[ADD_PTR_LCSSA:%.*]] = phi ptr [ [[ADD_PTR]], [[WHILE_END]] ] @@ -256,34 +258,34 @@ ; CHECK-T2-NEXT: [[PY_31056:%.*]] = phi ptr [ [[ADD_PTR_I884:%.*]], [[WHILE_BODY23]] ], [ [[PY_21070]], [[WHILE_BODY23_PREHEADER]] ] ; CHECK-T2-NEXT: [[PX_31055:%.*]] = phi ptr [ [[ADD_PTR_I890:%.*]], [[WHILE_BODY23]] ], [ [[PSRCB_PSRCA]], [[WHILE_BODY23_PREHEADER]] ] ; CHECK-T2-NEXT: [[ARRAYIDX_I907:%.*]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 1 -; CHECK-T2-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I907]], align 2 -; CHECK-T2-NEXT: [[TMP5:%.*]] = load i16, ptr [[PX_31055]], align 2 +; CHECK-T2-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_I907]], align 2 +; CHECK-T2-NEXT: [[TMP6:%.*]] = load i16, ptr [[PX_31055]], align 2 ; CHECK-T2-NEXT: [[ADD_PTR_I912:%.*]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 2 ; CHECK-T2-NEXT: [[ARRAYIDX_I901:%.*]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 1 -; CHECK-T2-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I901]], align 2 -; CHECK-T2-NEXT: [[TMP7:%.*]] = load i16, ptr [[PY_31056]], align 2 +; CHECK-T2-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_I901]], align 2 +; CHECK-T2-NEXT: [[TMP8:%.*]] = load i16, ptr [[PY_31056]], align 2 ; CHECK-T2-NEXT: [[ADD_PTR_I906:%.*]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 -2 -; CHECK-T2-NEXT: [[SHR_I892:%.*]] = sext i16 [[TMP5]] to i32 -; CHECK-T2-NEXT: [[SHR1_I893:%.*]] = sext i16 [[TMP6]] to i32 +; CHECK-T2-NEXT: [[SHR_I892:%.*]] = sext i16 [[TMP6]] to i32 +; CHECK-T2-NEXT: [[SHR1_I893:%.*]] = sext i16 [[TMP7]] to i32 ; CHECK-T2-NEXT: [[MUL_I894:%.*]] = mul nsw i32 [[SHR1_I893]], [[SHR_I892]] -; CHECK-T2-NEXT: [[SHR2_I895:%.*]] = sext i16 [[TMP4]] to i32 -; CHECK-T2-NEXT: [[SHR4_I897:%.*]] = sext i16 [[TMP7]] to i32 +; CHECK-T2-NEXT: [[SHR2_I895:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-T2-NEXT: 
[[SHR4_I897:%.*]] = sext i16 [[TMP8]] to i32 ; CHECK-T2-NEXT: [[MUL5_I898:%.*]] = mul nsw i32 [[SHR4_I897]], [[SHR2_I895]] ; CHECK-T2-NEXT: [[ADD_I899:%.*]] = add i32 [[MUL_I894]], [[SUM_11057]] ; CHECK-T2-NEXT: [[ADD6_I900:%.*]] = add i32 [[ADD_I899]], [[MUL5_I898]] ; CHECK-T2-NEXT: [[ARRAYIDX_I885:%.*]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 3 -; CHECK-T2-NEXT: [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_I885]], align 2 -; CHECK-T2-NEXT: [[TMP9:%.*]] = load i16, ptr [[ADD_PTR_I912]], align 2 +; CHECK-T2-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I885]], align 2 +; CHECK-T2-NEXT: [[TMP10:%.*]] = load i16, ptr [[ADD_PTR_I912]], align 2 ; CHECK-T2-NEXT: [[ADD_PTR_I890]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 4 ; CHECK-T2-NEXT: [[ARRAYIDX_I879:%.*]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 -1 -; CHECK-T2-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I879]], align 2 -; CHECK-T2-NEXT: [[TMP11:%.*]] = load i16, ptr [[ADD_PTR_I906]], align 2 +; CHECK-T2-NEXT: [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_I879]], align 2 +; CHECK-T2-NEXT: [[TMP12:%.*]] = load i16, ptr [[ADD_PTR_I906]], align 2 ; CHECK-T2-NEXT: [[ADD_PTR_I884]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 -4 -; CHECK-T2-NEXT: [[SHR_I870:%.*]] = sext i16 [[TMP9]] to i32 -; CHECK-T2-NEXT: [[SHR1_I871:%.*]] = sext i16 [[TMP10]] to i32 +; CHECK-T2-NEXT: [[SHR_I870:%.*]] = sext i16 [[TMP10]] to i32 +; CHECK-T2-NEXT: [[SHR1_I871:%.*]] = sext i16 [[TMP11]] to i32 ; CHECK-T2-NEXT: [[MUL_I872:%.*]] = mul nsw i32 [[SHR1_I871]], [[SHR_I870]] -; CHECK-T2-NEXT: [[SHR2_I873:%.*]] = sext i16 [[TMP8]] to i32 -; CHECK-T2-NEXT: [[SHR4_I875:%.*]] = sext i16 [[TMP11]] to i32 +; CHECK-T2-NEXT: [[SHR2_I873:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-T2-NEXT: [[SHR4_I875:%.*]] = sext i16 [[TMP12]] to i32 ; CHECK-T2-NEXT: [[MUL5_I876:%.*]] = mul nsw i32 [[SHR4_I875]], [[SHR2_I873]] ; CHECK-T2-NEXT: [[ADD_I877:%.*]] = add i32 [[ADD6_I900]], [[MUL_I872]] ; CHECK-T2-NEXT: [[ADD6_I878]] = add i32 [[ADD_I877]], [[MUL5_I876]] @@ -311,11 +313,11 @@ ; CHECK-T2-NEXT: [[PY_41064:%.*]] = phi ptr [ [[INCDEC_PTR39:%.*]], [[WHILE_BODY36]] ], [ [[ADD_PTR32]], [[WHILE_BODY36_PREHEADER]] ] ; CHECK-T2-NEXT: [[PX_41063:%.*]] = phi ptr [ [[INCDEC_PTR37:%.*]], [[WHILE_BODY36]] ], [ [[PX_3_LCSSA]], [[WHILE_BODY36_PREHEADER]] ] ; CHECK-T2-NEXT: [[INCDEC_PTR37]] = getelementptr inbounds i16, ptr [[PX_41063]], i32 1 -; CHECK-T2-NEXT: [[TMP12:%.*]] = load i16, ptr [[PX_41063]], align 2 -; CHECK-T2-NEXT: [[CONV38:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-T2-NEXT: [[TMP13:%.*]] = load i16, ptr [[PX_41063]], align 2 +; CHECK-T2-NEXT: [[CONV38:%.*]] = sext i16 [[TMP13]] to i32 ; CHECK-T2-NEXT: [[INCDEC_PTR39]] = getelementptr inbounds i16, ptr [[PY_41064]], i32 -1 -; CHECK-T2-NEXT: [[TMP13:%.*]] = load i16, ptr [[PY_41064]], align 2 -; CHECK-T2-NEXT: [[CONV40:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-T2-NEXT: [[TMP14:%.*]] = load i16, ptr [[PY_41064]], align 2 +; CHECK-T2-NEXT: [[CONV40:%.*]] = sext i16 [[TMP14]] to i32 ; CHECK-T2-NEXT: [[MUL_I863:%.*]] = mul nsw i32 [[CONV40]], [[CONV38]] ; CHECK-T2-NEXT: [[SHR3_I864:%.*]] = ashr i32 [[CONV38]], 16 ; CHECK-T2-NEXT: [[SHR4_I865:%.*]] = ashr i32 [[CONV40]], 16 @@ -330,8 +332,8 @@ ; CHECK-T2-NEXT: br label [[WHILE_END43]] ; CHECK-T2: while.end43: ; CHECK-T2-NEXT: [[SUM_2_LCSSA:%.*]] = phi i32 [ [[SUM_1_LCSSA]], [[WHILE_END31]] ], [ [[ADD6_I868_LCSSA]], [[WHILE_END43_LOOPEXIT]] ] -; CHECK-T2-NEXT: [[TMP14:%.*]] = lshr i32 [[SUM_2_LCSSA]], 15 -; CHECK-T2-NEXT: [[CONV45:%.*]] = trunc i32 [[TMP14]] to 
i16 +; CHECK-T2-NEXT: [[TMP15:%.*]] = lshr i32 [[SUM_2_LCSSA]], 15 +; CHECK-T2-NEXT: [[CONV45:%.*]] = trunc i32 [[TMP15]] to i16 ; CHECK-T2-NEXT: [[INCDEC_PTR46]] = getelementptr inbounds i16, ptr [[POUT_11069]], i32 1 ; CHECK-T2-NEXT: store i16 [[CONV45]], ptr [[POUT_11069]], align 2 ; CHECK-T2-NEXT: [[SUB47:%.*]] = add i32 [[COUNT_11072]], -1 diff --git a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll --- a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll +++ b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll @@ -2,7 +2,7 @@ ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes=inferattrs -S | FileCheck --match-full-lines --check-prefixes=CHECK,CHECK-KNOWN,CHECK-NOLINUX,CHECK-OPEN,CHECK-DARWIN %s ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes=inferattrs -S | FileCheck --match-full-lines --check-prefixes=CHECK,CHECK-KNOWN,CHECK-LINUX %s ; RUN: opt < %s -mtriple=nvptx -passes=inferattrs -S | FileCheck --match-full-lines --check-prefixes=CHECK-NOLINUX,CHECK-NVPTX %s -; RUN: opt < %s -mtriple=powerpc-ibm-aix-xcoff -passes=inferattrs -S | FileCheck --match-full-lines --check-prefixes=CHECK-AIX %s +; RUN: opt < %s -mtriple=powerpc64-ibm-aix-xcoff -passes=inferattrs -S | FileCheck --match-full-lines --check-prefixes=CHECK-NOLINUX,CHECK-AIX %s declare i32 @__nvvm_reflect(ptr) ; CHECK-NVPTX: declare noundef i32 @__nvvm_reflect(ptr noundef) [[NOFREE_NOUNWIND_READNONE:#[0-9]+]] diff --git a/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll b/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll --- a/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll +++ b/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instcombine -S -mtriple "i386-pc-linux" | FileCheck %s -; RUN: opt < %s -passes=instcombine -S -mtriple "i386-pc-win32" | FileCheck %s -; RUN: opt < %s -passes=instcombine -S -mtriple "x86_64-pc-win32" | FileCheck %s -; RUN: opt < %s -passes=instcombine -S -mtriple "i386-pc-mingw32" | FileCheck %s -; RUN: opt < %s -passes=instcombine -S -mtriple "x86_64-pc-mingw32" | FileCheck %s -; RUN: opt < %s -passes=instcombine -S -mtriple "sparc-sun-solaris" | FileCheck %s +; RUN: opt < %s -passes=instcombine -S -mtriple "i386-pc-linux" | FileCheck %s --check-prefixes=CHECK,DOUBLE-4BYTE-ALIGN +; RUN: opt < %s -passes=instcombine -S -mtriple "i386-pc-win32" | FileCheck %s --check-prefixes=CHECK,DOUBLE-8BYTE-ALIGN +; RUN: opt < %s -passes=instcombine -S -mtriple "x86_64-pc-win32" | FileCheck %s --check-prefixes=CHECK,DOUBLE-8BYTE-ALIGN +; RUN: opt < %s -passes=instcombine -S -mtriple "i386-pc-mingw32" | FileCheck %s --check-prefixes=CHECK,DOUBLE-8BYTE-ALIGN +; RUN: opt < %s -passes=instcombine -S -mtriple "x86_64-pc-mingw32" | FileCheck %s --check-prefixes=CHECK,DOUBLE-8BYTE-ALIGN +; RUN: opt < %s -passes=instcombine -S -mtriple "sparc-sun-solaris" | FileCheck %s --check-prefixes=CHECK,DOUBLE-8BYTE-ALIGN ; RUN: opt < %s -passes=instcombine -S -mtriple "x86_64-pc-win32" -enable-debugify 2>&1 | FileCheck --check-prefix=DBG-VALID %s declare double @floor(double) @@ -708,12 +708,19 @@ } define float @test_no_shrink_intrin_floor_multi_use_fpext(half %C) { -; CHECK-LABEL: @test_no_shrink_intrin_floor_multi_use_fpext( -; CHECK-NEXT: [[D:%.*]] = fpext half [[C:%.*]] to double -; CHECK-NEXT: store volatile double [[D]], ptr undef, align 8 -; CHECK-NEXT: [[E:%.*]] = call double 
@llvm.floor.f64(double [[D]]) -; CHECK-NEXT: [[F:%.*]] = fptrunc double [[E]] to float -; CHECK-NEXT: ret float [[F]] +; DOUBLE-4BYTE-ALIGN-LABEL: @test_no_shrink_intrin_floor_multi_use_fpext( +; DOUBLE-4BYTE-ALIGN-NEXT: [[D:%.*]] = fpext half [[C:%.*]] to double +; DOUBLE-4BYTE-ALIGN-NEXT: store volatile double [[D]], ptr undef, align 4 +; DOUBLE-4BYTE-ALIGN-NEXT: [[E:%.*]] = call double @llvm.floor.f64(double [[D]]) +; DOUBLE-4BYTE-ALIGN-NEXT: [[F:%.*]] = fptrunc double [[E]] to float +; DOUBLE-4BYTE-ALIGN-NEXT: ret float [[F]] +; +; DOUBLE-8BYTE-ALIGN-LABEL: @test_no_shrink_intrin_floor_multi_use_fpext( +; DOUBLE-8BYTE-ALIGN-NEXT: [[D:%.*]] = fpext half [[C:%.*]] to double +; DOUBLE-8BYTE-ALIGN-NEXT: store volatile double [[D]], ptr undef, align 8 +; DOUBLE-8BYTE-ALIGN-NEXT: [[E:%.*]] = call double @llvm.floor.f64(double [[D]]) +; DOUBLE-8BYTE-ALIGN-NEXT: [[F:%.*]] = fptrunc double [[E]] to float +; DOUBLE-8BYTE-ALIGN-NEXT: ret float [[F]] ; %D = fpext half %C to double store volatile double %D, ptr undef @@ -723,12 +730,19 @@ } define float @test_no_shrink_intrin_fabs_multi_use_fpext(half %C) { -; CHECK-LABEL: @test_no_shrink_intrin_fabs_multi_use_fpext( -; CHECK-NEXT: [[D:%.*]] = fpext half [[C:%.*]] to double -; CHECK-NEXT: store volatile double [[D]], ptr undef, align 8 -; CHECK-NEXT: [[E:%.*]] = call double @llvm.fabs.f64(double [[D]]) -; CHECK-NEXT: [[F:%.*]] = fptrunc double [[E]] to float -; CHECK-NEXT: ret float [[F]] +; DOUBLE-4BYTE-ALIGN-LABEL: @test_no_shrink_intrin_fabs_multi_use_fpext( +; DOUBLE-4BYTE-ALIGN-NEXT: [[D:%.*]] = fpext half [[C:%.*]] to double +; DOUBLE-4BYTE-ALIGN-NEXT: store volatile double [[D]], ptr undef, align 4 +; DOUBLE-4BYTE-ALIGN-NEXT: [[E:%.*]] = call double @llvm.fabs.f64(double [[D]]) +; DOUBLE-4BYTE-ALIGN-NEXT: [[F:%.*]] = fptrunc double [[E]] to float +; DOUBLE-4BYTE-ALIGN-NEXT: ret float [[F]] +; +; DOUBLE-8BYTE-ALIGN-LABEL: @test_no_shrink_intrin_fabs_multi_use_fpext( +; DOUBLE-8BYTE-ALIGN-NEXT: [[D:%.*]] = fpext half [[C:%.*]] to double +; DOUBLE-8BYTE-ALIGN-NEXT: store volatile double [[D]], ptr undef, align 8 +; DOUBLE-8BYTE-ALIGN-NEXT: [[E:%.*]] = call double @llvm.fabs.f64(double [[D]]) +; DOUBLE-8BYTE-ALIGN-NEXT: [[F:%.*]] = fptrunc double [[E]] to float +; DOUBLE-8BYTE-ALIGN-NEXT: ret float [[F]] ; %D = fpext half %C to double store volatile double %D, ptr undef diff --git a/llvm/test/Transforms/InstCombine/ffs-i16.ll b/llvm/test/Transforms/InstCombine/ffs-i16.ll --- a/llvm/test/Transforms/InstCombine/ffs-i16.ll +++ b/llvm/test/Transforms/InstCombine/ffs-i16.ll @@ -13,13 +13,13 @@ define void @fold_ffs(i16 %x) { ; AVR-LABEL: @fold_ffs( -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 1) -; AVR-NEXT: [[CTTZ:%.*]] = call i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 true), !range [[RNG0:![0-9]+]] +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 1) +; AVR-NEXT: [[CTTZ:%.*]] = call addrspace(1) i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 true), !range [[RNG0:![0-9]+]] ; AVR-NEXT: [[TMP1:%.*]] = add nuw nsw i16 [[CTTZ]], 1 ; AVR-NEXT: [[DOTNOT:%.*]] = icmp eq i16 [[X]], 0 ; AVR-NEXT: [[NX:%.*]] = select i1 [[DOTNOT]], i16 0, i16 [[TMP1]] -; AVR-NEXT: call void @sink(i16 [[NX]]) +; AVR-NEXT: call addrspace(1) void @sink(i16 [[NX]]) ; AVR-NEXT: ret void ; ; MSP430-LABEL: @fold_ffs( diff --git a/llvm/test/Transforms/InstCombine/fls-i16.ll b/llvm/test/Transforms/InstCombine/fls-i16.ll --- a/llvm/test/Transforms/InstCombine/fls-i16.ll +++ b/llvm/test/Transforms/InstCombine/fls-i16.ll @@ -14,11 
+14,11 @@ define void @fold_fls(i16 %x) { ; AVR-LABEL: @fold_fls( -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 1) -; AVR-NEXT: [[CTLZ:%.*]] = call i16 @llvm.ctlz.i16(i16 [[X:%.*]], i1 false), !range [[RNG0:![0-9]+]] +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 1) +; AVR-NEXT: [[CTLZ:%.*]] = call addrspace(1) i16 @llvm.ctlz.i16(i16 [[X:%.*]], i1 false), !range [[RNG0:![0-9]+]] ; AVR-NEXT: [[NX:%.*]] = sub nuw nsw i16 16, [[CTLZ]] -; AVR-NEXT: call void @sink(i16 [[NX]]) +; AVR-NEXT: call addrspace(1) void @sink(i16 [[NX]]) ; AVR-NEXT: ret void ; ; MSP430-LABEL: @fold_fls( diff --git a/llvm/test/Transforms/InstCombine/isascii-i16.ll b/llvm/test/Transforms/InstCombine/isascii-i16.ll --- a/llvm/test/Transforms/InstCombine/isascii-i16.ll +++ b/llvm/test/Transforms/InstCombine/isascii-i16.ll @@ -12,17 +12,17 @@ define void @fold_isascii(i16 %c) { ; AVR-LABEL: @fold_isascii( -; AVR-NEXT: call void @sink(i16 1) -; AVR-NEXT: call void @sink(i16 1) -; AVR-NEXT: call void @sink(i16 1) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 1) +; AVR-NEXT: call addrspace(1) void @sink(i16 1) +; AVR-NEXT: call addrspace(1) void @sink(i16 1) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) ; AVR-NEXT: [[ISASCII:%.*]] = icmp ult i16 [[C:%.*]], 128 ; AVR-NEXT: [[IC:%.*]] = zext i1 [[ISASCII]] to i16 -; AVR-NEXT: call void @sink(i16 [[IC]]) +; AVR-NEXT: call addrspace(1) void @sink(i16 [[IC]]) ; AVR-NEXT: ret void ; ; MSP430-LABEL: @fold_isascii( diff --git a/llvm/test/Transforms/InstCombine/isdigit-i16.ll b/llvm/test/Transforms/InstCombine/isdigit-i16.ll --- a/llvm/test/Transforms/InstCombine/isdigit-i16.ll +++ b/llvm/test/Transforms/InstCombine/isdigit-i16.ll @@ -11,22 +11,22 @@ define void @fold_isdigit(i16 %c) { ; AVR-LABEL: @fold_isdigit( -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 1) -; AVR-NEXT: call void @sink(i16 1) -; AVR-NEXT: call void @sink(i16 1) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 1) +; AVR-NEXT: call addrspace(1) void @sink(i16 1) +; AVR-NEXT: call addrspace(1) void @sink(i16 1) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) ; AVR-NEXT: [[ISDIGITTMP:%.*]] = add i16 [[C:%.*]], -48 ; AVR-NEXT: [[ISDIGIT:%.*]] = icmp ult i16 [[ISDIGITTMP]], 10 ; AVR-NEXT: [[IC:%.*]] = zext i1 [[ISDIGIT]] to i16 -; AVR-NEXT: call void @sink(i16 [[IC]]) +; AVR-NEXT: call addrspace(1) void @sink(i16 [[IC]]) ; AVR-NEXT: ret void ; ; MSP430-LABEL: @fold_isdigit( diff --git 
a/llvm/test/Transforms/InstCombine/pow-4.ll b/llvm/test/Transforms/InstCombine/pow-4.ll --- a/llvm/test/Transforms/InstCombine/pow-4.ll +++ b/llvm/test/Transforms/InstCombine/pow-4.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=instcombine -S < %s -mtriple unknown | FileCheck %s --check-prefixes=CHECK,CHECKI32,CHECKSQRT -; RUN: opt -passes=instcombine -S < %s -mtriple unknown -disable-builtin sqrt | FileCheck %s --check-prefixes=CHECK,CHECKI32,CHECKNOSQRT +; RUN: opt -passes=instcombine -S < %s -mtriple unknown -data-layout=e | FileCheck %s --check-prefixes=CHECK,CHECKI32,CHECKSQRT +; RUN: opt -passes=instcombine -S < %s -mtriple unknown -data-layout=e -disable-builtin sqrt | FileCheck %s --check-prefixes=CHECK,CHECKI32,CHECKNOSQRT ; RUN: opt -passes=instcombine -S < %s -mtriple msp430 | FileCheck %s --check-prefixes=CHECK,CHECKI16,CHECKSQRT ; RUN: opt -passes=instcombine -S < %s -mtriple msp430 -disable-builtin sqrt | FileCheck %s --check-prefixes=CHECK,CHECKI16,CHECKNOSQRT diff --git a/llvm/test/Transforms/InstCombine/pow_fp_int.ll b/llvm/test/Transforms/InstCombine/pow_fp_int.ll --- a/llvm/test/Transforms/InstCombine/pow_fp_int.ll +++ b/llvm/test/Transforms/InstCombine/pow_fp_int.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 ; RUN: opt -mtriple unknown -passes=instcombine -S < %s | FileCheck %s +target datalayout = "e" ; PR42190 ; Can't generate test checks due to PR42740. diff --git a/llvm/test/Transforms/InstCombine/printf-i16.ll b/llvm/test/Transforms/InstCombine/printf-i16.ll --- a/llvm/test/Transforms/InstCombine/printf-i16.ll +++ b/llvm/test/Transforms/InstCombine/printf-i16.ll @@ -24,21 +24,21 @@ define void @xform_printf(i8 %c8, i16 %c16) { ; AVR-LABEL: @xform_printf( -; AVR-NEXT: [[PUTCHAR:%.*]] = call i16 @putchar(i16 1) -; AVR-NEXT: [[PUTCHAR1:%.*]] = call i16 @putchar(i16 1) -; AVR-NEXT: [[PUTCHAR2:%.*]] = call i16 @putchar(i16 1) -; AVR-NEXT: [[PUTCHAR3:%.*]] = call i16 @putchar(i16 127) -; AVR-NEXT: [[PUTCHAR4:%.*]] = call i16 @putchar(i16 127) -; AVR-NEXT: [[PUTCHAR5:%.*]] = call i16 @putchar(i16 127) -; AVR-NEXT: [[PUTCHAR6:%.*]] = call i16 @putchar(i16 128) -; AVR-NEXT: [[PUTCHAR7:%.*]] = call i16 @putchar(i16 128) -; AVR-NEXT: [[PUTCHAR8:%.*]] = call i16 @putchar(i16 128) -; AVR-NEXT: [[PUTCHAR9:%.*]] = call i16 @putchar(i16 255) -; AVR-NEXT: [[PUTCHAR10:%.*]] = call i16 @putchar(i16 255) -; AVR-NEXT: [[PUTCHAR11:%.*]] = call i16 @putchar(i16 255) +; AVR-NEXT: [[PUTCHAR:%.*]] = call addrspace(1) i16 @putchar(i16 1) +; AVR-NEXT: [[PUTCHAR1:%.*]] = call addrspace(1) i16 @putchar(i16 1) +; AVR-NEXT: [[PUTCHAR2:%.*]] = call addrspace(1) i16 @putchar(i16 1) +; AVR-NEXT: [[PUTCHAR3:%.*]] = call addrspace(1) i16 @putchar(i16 127) +; AVR-NEXT: [[PUTCHAR4:%.*]] = call addrspace(1) i16 @putchar(i16 127) +; AVR-NEXT: [[PUTCHAR5:%.*]] = call addrspace(1) i16 @putchar(i16 127) +; AVR-NEXT: [[PUTCHAR6:%.*]] = call addrspace(1) i16 @putchar(i16 128) +; AVR-NEXT: [[PUTCHAR7:%.*]] = call addrspace(1) i16 @putchar(i16 128) +; AVR-NEXT: [[PUTCHAR8:%.*]] = call addrspace(1) i16 @putchar(i16 128) +; AVR-NEXT: [[PUTCHAR9:%.*]] = call addrspace(1) i16 @putchar(i16 255) +; AVR-NEXT: [[PUTCHAR10:%.*]] = call addrspace(1) i16 @putchar(i16 255) +; AVR-NEXT: [[PUTCHAR11:%.*]] = call addrspace(1) i16 @putchar(i16 255) ; AVR-NEXT: [[TMP1:%.*]] = zext i8 [[C8:%.*]] to i16 -; AVR-NEXT: [[PUTCHAR12:%.*]] = call i16 @putchar(i16 [[TMP1]]) -; AVR-NEXT: [[PUTCHAR13:%.*]] 
= call i16 @putchar(i16 [[C16:%.*]]) +; AVR-NEXT: [[PUTCHAR12:%.*]] = call addrspace(1) i16 @putchar(i16 [[TMP1]]) +; AVR-NEXT: [[PUTCHAR13:%.*]] = call addrspace(1) i16 @putchar(i16 [[C16:%.*]]) ; AVR-NEXT: ret void ; ; MSP430-LABEL: @xform_printf( diff --git a/llvm/test/Transforms/InstCombine/puts-i16.ll b/llvm/test/Transforms/InstCombine/puts-i16.ll --- a/llvm/test/Transforms/InstCombine/puts-i16.ll +++ b/llvm/test/Transforms/InstCombine/puts-i16.ll @@ -14,7 +14,7 @@ define void @xform_puts(i16 %c) { ; Transform puts("") to putchar("\n"). ; AVR-LABEL: @xform_puts( -; AVR-NEXT: [[PUTCHAR:%.*]] = call i16 @putchar(i16 10) +; AVR-NEXT: [[PUTCHAR:%.*]] = call addrspace(1) i16 @putchar(i16 10) ; AVR-NEXT: ret void ; ; MSP430-LABEL: @xform_puts( diff --git a/llvm/test/Transforms/InstCombine/sincospi.ll b/llvm/test/Transforms/InstCombine/sincospi.ll --- a/llvm/test/Transforms/InstCombine/sincospi.ll +++ b/llvm/test/Transforms/InstCombine/sincospi.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes=instcombine -S < %s -mtriple=x86_64-apple-macosx10.9 | FileCheck %s --check-prefix=CHECK-FLOAT-IN-VEC -; RUN: opt -passes=instcombine -S < %s -mtriple=arm-apple-ios7.0 | FileCheck %s -; RUN: opt -passes=instcombine -S < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s -; RUN: opt -passes=instcombine -S < %s -mtriple=x86_64-apple-macosx10.8 | FileCheck %s --check-prefix=CHECK-NO-SINCOS -; RUN: opt -passes=instcombine -S < %s -mtriple=arm-apple-ios6.0 | FileCheck %s --check-prefix=CHECK-NO-SINCOS -; RUN: opt -passes=instcombine -S < %s -mtriple=x86_64-none-linux-gnu | FileCheck %s --check-prefix=CHECK-NO-SINCOS +; RUN: opt -passes=instcombine -S < %s -mtriple=arm-apple-ios7.0 | FileCheck %s --check-prefixes=CHECK,CHECK-DOUBLE-ALIGN4 +; RUN: opt -passes=instcombine -S < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefixes=CHECK,CHECK-DOUBLE-ALIGN8 +; RUN: opt -passes=instcombine -S < %s -mtriple=x86_64-apple-macosx10.8 | FileCheck %s --check-prefixes=CHECK-NO-SINCOS,CHECK-NO-SINCOS-DOUBLE-ALIGN8 +; RUN: opt -passes=instcombine -S < %s -mtriple=arm-apple-ios6.0 | FileCheck %s --check-prefixes=CHECK-NO-SINCOS,CHECK-NO-SINCOS-DOUBLE-ALIGN4 +; RUN: opt -passes=instcombine -S < %s -mtriple=x86_64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK-NO-SINCOS,CHECK-NO-SINCOS-DOUBLE-ALIGN8 attributes #0 = { readnone nounwind } @@ -129,21 +129,37 @@ ; CHECK-FLOAT-IN-VEC-NEXT: [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]] ; CHECK-FLOAT-IN-VEC-NEXT: ret double [[RES]] ; -; CHECK-LABEL: @test_instbased_f64( -; CHECK-NEXT: [[VAL:%.*]] = load double, ptr @var64, align 8 -; CHECK-NEXT: [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double [[VAL]]) -; CHECK-NEXT: [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0 -; CHECK-NEXT: [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1 -; CHECK-NEXT: [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]] -; CHECK-NEXT: [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]] -; CHECK-NEXT: ret double [[RES]] +; CHECK-DOUBLE-ALIGN4-LABEL: @test_instbased_f64( +; CHECK-DOUBLE-ALIGN4-NEXT: [[VAL:%.*]] = load double, ptr @var64, align 4 +; CHECK-DOUBLE-ALIGN4-NEXT: [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double [[VAL]]) +; CHECK-DOUBLE-ALIGN4-NEXT: [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0 +; CHECK-DOUBLE-ALIGN4-NEXT: [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1 +; CHECK-DOUBLE-ALIGN4-NEXT: [[COS:%.*]] = call 
double @__cospi(double [[VAL]]) #[[ATTR0]] +; CHECK-DOUBLE-ALIGN4-NEXT: [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]] +; CHECK-DOUBLE-ALIGN4-NEXT: ret double [[RES]] ; -; CHECK-NO-SINCOS-LABEL: @test_instbased_f64( -; CHECK-NO-SINCOS-NEXT: [[VAL:%.*]] = load double, ptr @var64, align 8 -; CHECK-NO-SINCOS-NEXT: [[SIN:%.*]] = call double @__sinpi(double [[VAL]]) #[[ATTR0]] -; CHECK-NO-SINCOS-NEXT: [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]] -; CHECK-NO-SINCOS-NEXT: [[RES:%.*]] = fadd double [[SIN]], [[COS]] -; CHECK-NO-SINCOS-NEXT: ret double [[RES]] +; CHECK-DOUBLE-ALIGN8-LABEL: @test_instbased_f64( +; CHECK-DOUBLE-ALIGN8-NEXT: [[VAL:%.*]] = load double, ptr @var64, align 8 +; CHECK-DOUBLE-ALIGN8-NEXT: [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double [[VAL]]) +; CHECK-DOUBLE-ALIGN8-NEXT: [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0 +; CHECK-DOUBLE-ALIGN8-NEXT: [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1 +; CHECK-DOUBLE-ALIGN8-NEXT: [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]] +; CHECK-DOUBLE-ALIGN8-NEXT: [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]] +; CHECK-DOUBLE-ALIGN8-NEXT: ret double [[RES]] +; +; CHECK-NO-SINCOS-DOUBLE-ALIGN8-LABEL: @test_instbased_f64( +; CHECK-NO-SINCOS-DOUBLE-ALIGN8-NEXT: [[VAL:%.*]] = load double, ptr @var64, align 8 +; CHECK-NO-SINCOS-DOUBLE-ALIGN8-NEXT: [[SIN:%.*]] = call double @__sinpi(double [[VAL]]) #[[ATTR0]] +; CHECK-NO-SINCOS-DOUBLE-ALIGN8-NEXT: [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]] +; CHECK-NO-SINCOS-DOUBLE-ALIGN8-NEXT: [[RES:%.*]] = fadd double [[SIN]], [[COS]] +; CHECK-NO-SINCOS-DOUBLE-ALIGN8-NEXT: ret double [[RES]] +; +; CHECK-NO-SINCOS-DOUBLE-ALIGN4-LABEL: @test_instbased_f64( +; CHECK-NO-SINCOS-DOUBLE-ALIGN4-NEXT: [[VAL:%.*]] = load double, ptr @var64, align 4 +; CHECK-NO-SINCOS-DOUBLE-ALIGN4-NEXT: [[SIN:%.*]] = call double @__sinpi(double [[VAL]]) #[[ATTR0]] +; CHECK-NO-SINCOS-DOUBLE-ALIGN4-NEXT: [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]] +; CHECK-NO-SINCOS-DOUBLE-ALIGN4-NEXT: [[RES:%.*]] = fadd double [[SIN]], [[COS]] +; CHECK-NO-SINCOS-DOUBLE-ALIGN4-NEXT: ret double [[RES]] ; %val = load double, ptr @var64 %sin = call double @__sinpi(double %val) #0 diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/calls-math-finite.ll b/llvm/test/Transforms/InstSimplify/ConstProp/calls-math-finite.ll --- a/llvm/test/Transforms/InstSimplify/ConstProp/calls-math-finite.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/calls-math-finite.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=instsimplify -S | FileCheck %s -; RUN: opt < %s -passes=instsimplify -S -mtriple=unknown-unknown-linux-musl | FileCheck -check-prefix=MUSL %s +; RUN: opt < %s -passes=instsimplify -S -mtriple=x86_64-unknown-linux-musl | FileCheck -check-prefix=MUSL %s ; Test to verify constant folding can occur when math routines are mapped ; to the __<func>_finite versions of functions due to __FINITE_MATH_ONLY__ ; being enabled on headers on Linux. All calls should constant fold away ; in this test.
-target triple = "unknown-unknown-linux-gnu" +target triple = "x86_64-unknown-linux-gnu" declare double @__acos_finite(double) #0 declare float @__acosf_finite(float) #0 diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll --- a/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll @@ -10,21 +10,23 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[WHILE_BODY_LR_PH_I:%.*]] ; CHECK: while.body.lr.ph.i: +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 16 ; CHECK-NEXT: br label [[WHILE_BODY_I:%.*]] ; CHECK: while.body.i: ; CHECK-NEXT: [[INDVARS_IV7_I:%.*]] = phi i64 [ 16, [[WHILE_BODY_LR_PH_I]] ], [ [[INDVARS_IV_NEXT8_I:%.*]], [[COND_TRUE29_I:%.*]] ] ; CHECK-NEXT: [[I_05_I:%.*]] = phi i64 [ 0, [[WHILE_BODY_LR_PH_I]] ], [ [[INDVARS_IV7_I]], [[COND_TRUE29_I]] ] +; CHECK-NEXT: [[LSR4:%.*]] = trunc i64 [[I_05_I]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[LSR4]] to i64 +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[UGLYGEP]], i64 [[TMP0]] ; CHECK-NEXT: [[SEXT_I:%.*]] = shl i64 [[I_05_I]], 32 ; CHECK-NEXT: [[IDX_EXT_I:%.*]] = ashr exact i64 [[SEXT_I]], 32 ; CHECK-NEXT: [[ADD_PTR_SUM_I:%.*]] = add i64 [[IDX_EXT_I]], 16 ; CHECK-NEXT: br label [[FOR_BODY_I:%.*]] ; CHECK: for.body.i: -; CHECK-NEXT: [[INDVARS_IV_I:%.*]] = phi i64 [ 0, [[WHILE_BODY_I]] ], [ [[INDVARS_IV_NEXT_I:%.*]], [[FOR_BODY_I]] ] -; CHECK-NEXT: [[ADD_PTR_SUM:%.*]] = add i64 [[ADD_PTR_SUM_I]], [[INDVARS_IV_I]] -; CHECK-NEXT: [[ARRAYIDX22_I:%.*]] = getelementptr inbounds i8, ptr [[BASE:%.*]], i64 [[ADD_PTR_SUM]] -; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX22_I]], align 1 -; CHECK-NEXT: [[INDVARS_IV_NEXT_I]] = add i64 [[INDVARS_IV_I]], 1 +; CHECK-NEXT: [[LSR_IV2:%.*]] = phi ptr [ [[UGLYGEP3:%.*]], [[FOR_BODY_I]] ], [ [[UGLYGEP1]], [[WHILE_BODY_I]] ] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[LSR_IV2]], align 1 ; CHECK-NEXT: [[CMP:%.*]] = call i1 @check() #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[UGLYGEP3]] = getelementptr i8, ptr [[LSR_IV2]], i64 1 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_END_I:%.*]], label [[FOR_BODY_I]] ; CHECK: for.end.i: ; CHECK-NEXT: [[ADD_PTR_I144:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD_PTR_SUM_I]] @@ -93,18 +95,20 @@ ; CHECK: for.cond468.preheader: ; CHECK-NEXT: br label [[FOR_COND468:%.*]] ; CHECK: for.cond468: -; CHECK-NEXT: [[INDVARS_IV1163:%.*]] = phi i64 [ [[INDVARS_IV_NEXT1164:%.*]], [[IF_THEN477:%.*]] ], [ 1, [[FOR_COND468_PREHEADER]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV1163]] to i32 -; CHECK-NEXT: [[CMP469:%.*]] = icmp slt i32 [[TMP0]], [[N:%.*]] +; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i32 [ 1, [[FOR_COND468_PREHEADER]] ], [ [[LSR_IV_NEXT:%.*]], [[IF_THEN477:%.*]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ getelementptr inbounds ([5000 x %struct.anon.7.91.199.307.415.475.559.643.751.835.943.1003.1111.1219.1351.1375.1399.1435.1471.1483.1519.1531.1651.1771], ptr @tags, i64 0, i64 0, i32 2), [[FOR_COND468_PREHEADER]] ], [ [[UGLYGEP:%.*]], [[IF_THEN477]] ] +; CHECK-NEXT: [[K_0:%.*]] = load i32, ptr [[LSR_IV]], align 4 +; CHECK-NEXT: [[CMP469:%.*]] = icmp slt i32 [[LSR_IV1]], [[N:%.*]] ; CHECK-NEXT: br i1 [[CMP469]], label [[FOR_BODY471:%.*]], label [[FOR_INC498_PREHEADER:%.*]] ; CHECK: for.body471: -; CHECK-NEXT: [[FIRST:%.*]] = getelementptr inbounds [5000 x 
%struct.anon.7.91.199.307.415.475.559.643.751.835.943.1003.1111.1219.1351.1375.1399.1435.1471.1483.1519.1531.1651.1771], ptr @tags, i64 0, i64 [[INDVARS_IV1163]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[FIRST]], align 4 +; CHECK-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[LSR_IV]], i64 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[UGLYGEP2]], align 4 ; CHECK-NEXT: br i1 false, label [[IF_THEN477]], label [[FOR_INC498_PREHEADER]] ; CHECK: for.inc498.preheader: ; CHECK-NEXT: br label [[FOR_INC498:%.*]] ; CHECK: if.then477: -; CHECK-NEXT: [[INDVARS_IV_NEXT1164]] = add i64 [[INDVARS_IV1163]], 1 +; CHECK-NEXT: [[UGLYGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 12 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add nuw nsw i32 [[LSR_IV1]], 1 ; CHECK-NEXT: br label [[FOR_COND468]] ; CHECK: for.inc498: ; CHECK-NEXT: br label [[FOR_INC498]] @@ -154,27 +158,27 @@ ; CHECK: for.body3.us.i: ; CHECK-NEXT: [[INDVARS_IV_I_SV_PHI:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_I:%.*]], [[MESHBB]] ], [ 0, [[FOR_BODY3_LR_PH_US_I:%.*]] ] ; CHECK-NEXT: [[OPQ_SA_CALC12:%.*]] = sub i32 undef, 227 -; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[INDVARS_IV_I_SV_PHI]], [[INDVARS_IV8_I_SV_PHI26:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 -; CHECK-NEXT: [[MUL_I_US_I:%.*]] = mul nsw i32 0, [[TMP1]] -; CHECK-NEXT: [[ARRAYIDX5_US_I:%.*]] = getelementptr inbounds double, ptr [[U:%.*]], i64 [[INDVARS_IV_I_SV_PHI]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX5_US_I]], align 8 -; CHECK-NEXT: [[INDVARS_IV_NEXT_I]] = add i64 [[INDVARS_IV_I_SV_PHI]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[LSR_IV:%.*]], [[INDVARS_IV_I_SV_PHI]] +; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[TMP0]] to i32 +; CHECK-NEXT: [[MUL_I_US_I:%.*]] = mul nsw i32 0, [[TMP]] +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV_I_SV_PHI]], 3 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[U:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[UGLYGEP]], align 8 ; CHECK-NEXT: br i1 undef, label [[FOR_INC8_US_I:%.*]], label [[MESHBB]] ; CHECK: for.body3.lr.ph.us.i.loopexit: +; CHECK-NEXT: [[LSR_IV_NEXT:%.*]] = add i64 [[LSR_IV]], 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH_US_I]] ; CHECK: for.body3.lr.ph.us.i: -; CHECK-NEXT: [[INDVARS_IV8_I_SV_PHI26]] = phi i64 [ undef, [[MESHBB1]] ], [ [[INDVARS_IV8_I_SV_PHI24:%.*]], [[FOR_BODY3_LR_PH_US_I_LOOPEXIT:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX_US_I:%.*]] = getelementptr inbounds double, ptr undef, i64 [[INDVARS_IV8_I_SV_PHI26]] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDVARS_IV8_I_SV_PHI26]], 1 +; CHECK-NEXT: [[LSR_IV]] = phi i64 [ [[LSR_IV_NEXT]], [[FOR_BODY3_LR_PH_US_I_LOOPEXIT:%.*]] ], [ undef, [[MESHBB1]] ] +; CHECK-NEXT: [[ARRAYIDX_US_I:%.*]] = getelementptr inbounds double, ptr undef, i64 [[LSR_IV]] ; CHECK-NEXT: br label [[FOR_BODY3_US_I:%.*]] ; CHECK: for.inc8.us.i2: ; CHECK-NEXT: unreachable ; CHECK: eval_At_times_u.exit: ; CHECK-NEXT: ret void ; CHECK: meshBB: -; CHECK-NEXT: [[INDVARS_IV8_I_SV_PHI24]] = phi i64 [ undef, [[FOR_BODY3_US_I]] ], [ [[TMP3]], [[FOR_INC8_US_I]] ] ; CHECK-NEXT: [[MESHSTACKVARIABLE_PHI:%.*]] = phi i32 [ [[OPQ_SA_CALC12]], [[FOR_BODY3_US_I]] ], [ undef, [[FOR_INC8_US_I]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT_I]] = add i64 [[INDVARS_IV_I_SV_PHI]], 1 ; CHECK-NEXT: br i1 true, label [[FOR_BODY3_LR_PH_US_I_LOOPEXIT]], label [[FOR_BODY3_US_I]] ; CHECK: meshBB1.loopexit: ; CHECK-NEXT: br label [[MESHBB1]] diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll b/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll 
--- a/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll +++ b/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll @@ -9,116 +9,110 @@ ; CHECK-A55-NEXT: [[CMP52_NOT:%.*]] = icmp eq i32 [[ARG_0:%.*]], 0 ; CHECK-A55-NEXT: br i1 [[CMP52_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY6_PREHEADER:%.*]] ; CHECK-A55: for.body6.preheader: -; CHECK-A55-NEXT: [[XTRAITER:%.*]] = and i32 [[ARG_0]], 3 +; CHECK-A55-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[ARG_0]] to i64 +; CHECK-A55-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 3 ; CHECK-A55-NEXT: [[TMP0:%.*]] = icmp ult i32 [[ARG_0]], 4 ; CHECK-A55-NEXT: br i1 [[TMP0]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY6_PREHEADER_NEW:%.*]] ; CHECK-A55: for.body6.preheader.new: -; CHECK-A55-NEXT: [[UNROLL_ITER:%.*]] = and i32 [[ARG_0]], -4 +; CHECK-A55-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292 ; CHECK-A55-NEXT: br label [[FOR_BODY6:%.*]] ; CHECK-A55: for.body6: -; CHECK-A55-NEXT: [[K_03:%.*]] = phi i32 [ 0, [[FOR_BODY6_PREHEADER_NEW]] ], [ [[INC_3:%.*]], [[FOR_BODY6]] ] -; CHECK-A55-NEXT: [[NITER:%.*]] = phi i32 [ 0, [[FOR_BODY6_PREHEADER_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[FOR_BODY6]] ] -; CHECK-A55-NEXT: [[IDX_EXT:%.*]] = zext i32 [[K_03]] to i64 -; CHECK-A55-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[ARG_2:%.*]], i64 [[IDX_EXT]] +; CHECK-A55-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY6_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_BODY6]] ] +; CHECK-A55-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[FOR_BODY6_PREHEADER_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[FOR_BODY6]] ] +; CHECK-A55-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[ARG_2:%.*]], i64 [[INDVARS_IV]] ; CHECK-A55-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 ; CHECK-A55-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 -; CHECK-A55-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[ARG_3:%.*]], i64 [[IDX_EXT]] +; CHECK-A55-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[ARG_3:%.*]], i64 [[INDVARS_IV]] ; CHECK-A55-NEXT: [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX14]], align 2 ; CHECK-A55-NEXT: [[CONV15:%.*]] = sext i16 [[TMP2]] to i32 ; CHECK-A55-NEXT: [[MUL16:%.*]] = mul nsw i32 [[CONV15]], [[CONV]] -; CHECK-A55-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[ARG_1:%.*]], i64 [[IDX_EXT]] +; CHECK-A55-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[ARG_1:%.*]], i64 [[INDVARS_IV]] ; CHECK-A55-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 ; CHECK-A55-NEXT: [[ADD21:%.*]] = add nsw i32 [[MUL16]], [[TMP3]] ; CHECK-A55-NEXT: store i32 [[ADD21]], ptr [[ARRAYIDX20]], align 4 -; CHECK-A55-NEXT: [[INC:%.*]] = or i32 [[K_03]], 1 -; CHECK-A55-NEXT: [[IDX_EXT_1:%.*]] = zext i32 [[INC]] to i64 -; CHECK-A55-NEXT: [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[IDX_EXT_1]] +; CHECK-A55-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; CHECK-A55-NEXT: [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[INDVARS_IV_NEXT]] ; CHECK-A55-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX10_1]], align 2 ; CHECK-A55-NEXT: [[CONV_1:%.*]] = sext i16 [[TMP4]] to i32 -; CHECK-A55-NEXT: [[ARRAYIDX14_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[IDX_EXT_1]] +; CHECK-A55-NEXT: [[ARRAYIDX14_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[INDVARS_IV_NEXT]] ; CHECK-A55-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX14_1]], align 2 ; CHECK-A55-NEXT: [[CONV15_1:%.*]] = sext i16 [[TMP5]] to 
i32 ; CHECK-A55-NEXT: [[MUL16_1:%.*]] = mul nsw i32 [[CONV15_1]], [[CONV_1]] -; CHECK-A55-NEXT: [[ARRAYIDX20_1:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[IDX_EXT_1]] +; CHECK-A55-NEXT: [[ARRAYIDX20_1:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[INDVARS_IV_NEXT]] ; CHECK-A55-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX20_1]], align 4 ; CHECK-A55-NEXT: [[ADD21_1:%.*]] = add nsw i32 [[MUL16_1]], [[TMP6]] ; CHECK-A55-NEXT: store i32 [[ADD21_1]], ptr [[ARRAYIDX20_1]], align 4 -; CHECK-A55-NEXT: [[INC_1:%.*]] = or i32 [[K_03]], 2 -; CHECK-A55-NEXT: [[IDX_EXT_2:%.*]] = zext i32 [[INC_1]] to i64 -; CHECK-A55-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[IDX_EXT_2]] +; CHECK-A55-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2 +; CHECK-A55-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[INDVARS_IV_NEXT_1]] ; CHECK-A55-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX10_2]], align 2 ; CHECK-A55-NEXT: [[CONV_2:%.*]] = sext i16 [[TMP7]] to i32 -; CHECK-A55-NEXT: [[ARRAYIDX14_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[IDX_EXT_2]] +; CHECK-A55-NEXT: [[ARRAYIDX14_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[INDVARS_IV_NEXT_1]] ; CHECK-A55-NEXT: [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX14_2]], align 2 ; CHECK-A55-NEXT: [[CONV15_2:%.*]] = sext i16 [[TMP8]] to i32 ; CHECK-A55-NEXT: [[MUL16_2:%.*]] = mul nsw i32 [[CONV15_2]], [[CONV_2]] -; CHECK-A55-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[IDX_EXT_2]] +; CHECK-A55-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[INDVARS_IV_NEXT_1]] ; CHECK-A55-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX20_2]], align 4 ; CHECK-A55-NEXT: [[ADD21_2:%.*]] = add nsw i32 [[MUL16_2]], [[TMP9]] ; CHECK-A55-NEXT: store i32 [[ADD21_2]], ptr [[ARRAYIDX20_2]], align 4 -; CHECK-A55-NEXT: [[INC_2:%.*]] = or i32 [[K_03]], 3 -; CHECK-A55-NEXT: [[IDX_EXT_3:%.*]] = zext i32 [[INC_2]] to i64 -; CHECK-A55-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[IDX_EXT_3]] +; CHECK-A55-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3 +; CHECK-A55-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[INDVARS_IV_NEXT_2]] ; CHECK-A55-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10_3]], align 2 ; CHECK-A55-NEXT: [[CONV_3:%.*]] = sext i16 [[TMP10]] to i32 -; CHECK-A55-NEXT: [[ARRAYIDX14_3:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[IDX_EXT_3]] +; CHECK-A55-NEXT: [[ARRAYIDX14_3:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[INDVARS_IV_NEXT_2]] ; CHECK-A55-NEXT: [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX14_3]], align 2 ; CHECK-A55-NEXT: [[CONV15_3:%.*]] = sext i16 [[TMP11]] to i32 ; CHECK-A55-NEXT: [[MUL16_3:%.*]] = mul nsw i32 [[CONV15_3]], [[CONV_3]] -; CHECK-A55-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[IDX_EXT_3]] +; CHECK-A55-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[INDVARS_IV_NEXT_2]] ; CHECK-A55-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX20_3]], align 4 ; CHECK-A55-NEXT: [[ADD21_3:%.*]] = add nsw i32 [[MUL16_3]], [[TMP12]] ; CHECK-A55-NEXT: store i32 [[ADD21_3]], ptr [[ARRAYIDX20_3]], align 4 -; CHECK-A55-NEXT: [[INC_3]] = add nuw i32 [[K_03]], 4 -; CHECK-A55-NEXT: [[NITER_NEXT_3]] = add i32 [[NITER]], 4 -; CHECK-A55-NEXT: [[NITER_NCMP_3_NOT:%.*]] = icmp eq i32 [[NITER_NEXT_3]], [[UNROLL_ITER]] -; CHECK-A55-NEXT: br i1 [[NITER_NCMP_3_NOT]], label 
[[FOR_END_LOOPEXIT_UNR_LCSSA]], label [[FOR_BODY6]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-A55-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4 +; CHECK-A55-NEXT: [[NITER_NEXT_3]] = add i64 [[NITER]], 4 +; CHECK-A55-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]] +; CHECK-A55-NEXT: br i1 [[NITER_NCMP_3]], label [[FOR_END_LOOPEXIT_UNR_LCSSA]], label [[FOR_BODY6]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-A55: for.end.loopexit.unr-lcssa: -; CHECK-A55-NEXT: [[K_03_UNR:%.*]] = phi i32 [ 0, [[FOR_BODY6_PREHEADER]] ], [ [[INC_3]], [[FOR_BODY6]] ] -; CHECK-A55-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i32 [[XTRAITER]], 0 +; CHECK-A55-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[FOR_BODY6_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3]], [[FOR_BODY6]] ] +; CHECK-A55-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 ; CHECK-A55-NEXT: br i1 [[LCMP_MOD_NOT]], label [[FOR_END]], label [[FOR_BODY6_EPIL:%.*]] ; CHECK-A55: for.body6.epil: -; CHECK-A55-NEXT: [[IDX_EXT_EPIL:%.*]] = zext i32 [[K_03_UNR]] to i64 -; CHECK-A55-NEXT: [[ARRAYIDX10_EPIL:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[IDX_EXT_EPIL]] +; CHECK-A55-NEXT: [[ARRAYIDX10_EPIL:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[INDVARS_IV_UNR]] ; CHECK-A55-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX10_EPIL]], align 2 ; CHECK-A55-NEXT: [[CONV_EPIL:%.*]] = sext i16 [[TMP13]] to i32 -; CHECK-A55-NEXT: [[ARRAYIDX14_EPIL:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[IDX_EXT_EPIL]] +; CHECK-A55-NEXT: [[ARRAYIDX14_EPIL:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[INDVARS_IV_UNR]] ; CHECK-A55-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX14_EPIL]], align 2 ; CHECK-A55-NEXT: [[CONV15_EPIL:%.*]] = sext i16 [[TMP14]] to i32 ; CHECK-A55-NEXT: [[MUL16_EPIL:%.*]] = mul nsw i32 [[CONV15_EPIL]], [[CONV_EPIL]] -; CHECK-A55-NEXT: [[ARRAYIDX20_EPIL:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[IDX_EXT_EPIL]] +; CHECK-A55-NEXT: [[ARRAYIDX20_EPIL:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[INDVARS_IV_UNR]] ; CHECK-A55-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX20_EPIL]], align 4 ; CHECK-A55-NEXT: [[ADD21_EPIL:%.*]] = add nsw i32 [[MUL16_EPIL]], [[TMP15]] ; CHECK-A55-NEXT: store i32 [[ADD21_EPIL]], ptr [[ARRAYIDX20_EPIL]], align 4 -; CHECK-A55-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i32 [[XTRAITER]], 1 +; CHECK-A55-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 1 ; CHECK-A55-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY6_EPIL_1:%.*]] ; CHECK-A55: for.body6.epil.1: -; CHECK-A55-NEXT: [[INC_EPIL:%.*]] = add nuw i32 [[K_03_UNR]], 1 -; CHECK-A55-NEXT: [[IDX_EXT_EPIL_1:%.*]] = zext i32 [[INC_EPIL]] to i64 -; CHECK-A55-NEXT: [[ARRAYIDX10_EPIL_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[IDX_EXT_EPIL_1]] +; CHECK-A55-NEXT: [[INDVARS_IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[INDVARS_IV_UNR]], 1 +; CHECK-A55-NEXT: [[ARRAYIDX10_EPIL_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[INDVARS_IV_NEXT_EPIL]] ; CHECK-A55-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX10_EPIL_1]], align 2 ; CHECK-A55-NEXT: [[CONV_EPIL_1:%.*]] = sext i16 [[TMP16]] to i32 -; CHECK-A55-NEXT: [[ARRAYIDX14_EPIL_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[IDX_EXT_EPIL_1]] +; CHECK-A55-NEXT: [[ARRAYIDX14_EPIL_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[INDVARS_IV_NEXT_EPIL]] ; CHECK-A55-NEXT: [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX14_EPIL_1]], align 2 ; CHECK-A55-NEXT: [[CONV15_EPIL_1:%.*]] = sext i16 [[TMP17]] to 
i32 ; CHECK-A55-NEXT: [[MUL16_EPIL_1:%.*]] = mul nsw i32 [[CONV15_EPIL_1]], [[CONV_EPIL_1]] -; CHECK-A55-NEXT: [[ARRAYIDX20_EPIL_1:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[IDX_EXT_EPIL_1]] +; CHECK-A55-NEXT: [[ARRAYIDX20_EPIL_1:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[INDVARS_IV_NEXT_EPIL]] ; CHECK-A55-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX20_EPIL_1]], align 4 ; CHECK-A55-NEXT: [[ADD21_EPIL_1:%.*]] = add nsw i32 [[MUL16_EPIL_1]], [[TMP18]] ; CHECK-A55-NEXT: store i32 [[ADD21_EPIL_1]], ptr [[ARRAYIDX20_EPIL_1]], align 4 -; CHECK-A55-NEXT: [[EPIL_ITER_CMP_1_NOT:%.*]] = icmp eq i32 [[XTRAITER]], 2 +; CHECK-A55-NEXT: [[EPIL_ITER_CMP_1_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 2 ; CHECK-A55-NEXT: br i1 [[EPIL_ITER_CMP_1_NOT]], label [[FOR_END]], label [[FOR_BODY6_EPIL_2:%.*]] ; CHECK-A55: for.body6.epil.2: -; CHECK-A55-NEXT: [[INC_EPIL_1:%.*]] = add nuw i32 [[K_03_UNR]], 2 -; CHECK-A55-NEXT: [[IDX_EXT_EPIL_2:%.*]] = zext i32 [[INC_EPIL_1]] to i64 -; CHECK-A55-NEXT: [[ARRAYIDX10_EPIL_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[IDX_EXT_EPIL_2]] +; CHECK-A55-NEXT: [[INDVARS_IV_NEXT_EPIL_1:%.*]] = add nuw nsw i64 [[INDVARS_IV_UNR]], 2 +; CHECK-A55-NEXT: [[ARRAYIDX10_EPIL_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[INDVARS_IV_NEXT_EPIL_1]] ; CHECK-A55-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX10_EPIL_2]], align 2 ; CHECK-A55-NEXT: [[CONV_EPIL_2:%.*]] = sext i16 [[TMP19]] to i32 -; CHECK-A55-NEXT: [[ARRAYIDX14_EPIL_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[IDX_EXT_EPIL_2]] +; CHECK-A55-NEXT: [[ARRAYIDX14_EPIL_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[INDVARS_IV_NEXT_EPIL_1]] ; CHECK-A55-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX14_EPIL_2]], align 2 ; CHECK-A55-NEXT: [[CONV15_EPIL_2:%.*]] = sext i16 [[TMP20]] to i32 ; CHECK-A55-NEXT: [[MUL16_EPIL_2:%.*]] = mul nsw i32 [[CONV15_EPIL_2]], [[CONV_EPIL_2]] -; CHECK-A55-NEXT: [[ARRAYIDX20_EPIL_2:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[IDX_EXT_EPIL_2]] +; CHECK-A55-NEXT: [[ARRAYIDX20_EPIL_2:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[INDVARS_IV_NEXT_EPIL_1]] ; CHECK-A55-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX20_EPIL_2]], align 4 ; CHECK-A55-NEXT: [[ADD21_EPIL_2:%.*]] = add nsw i32 [[MUL16_EPIL_2]], [[TMP21]] ; CHECK-A55-NEXT: store i32 [[ADD21_EPIL_2]], ptr [[ARRAYIDX20_EPIL_2]], align 4 @@ -129,24 +123,26 @@ ; CHECK-GENERIC-LABEL: @runtime_unroll_generic( ; CHECK-GENERIC-NEXT: entry: ; CHECK-GENERIC-NEXT: [[CMP52_NOT:%.*]] = icmp eq i32 [[ARG_0:%.*]], 0 -; CHECK-GENERIC-NEXT: br i1 [[CMP52_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY6:%.*]] +; CHECK-GENERIC-NEXT: br i1 [[CMP52_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY6_PREHEADER:%.*]] +; CHECK-GENERIC: for.body6.preheader: +; CHECK-GENERIC-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[ARG_0]] to i64 +; CHECK-GENERIC-NEXT: br label [[FOR_BODY6:%.*]] ; CHECK-GENERIC: for.body6: -; CHECK-GENERIC-NEXT: [[K_03:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY6]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-GENERIC-NEXT: [[IDX_EXT:%.*]] = zext i32 [[K_03]] to i64 -; CHECK-GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[ARG_2:%.*]], i64 [[IDX_EXT]] +; CHECK-GENERIC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY6_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY6]] ] +; CHECK-GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[ARG_2:%.*]], i64 [[INDVARS_IV]] ; CHECK-GENERIC-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 ; 
CHECK-GENERIC-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 -; CHECK-GENERIC-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[ARG_3:%.*]], i64 [[IDX_EXT]] +; CHECK-GENERIC-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[ARG_3:%.*]], i64 [[INDVARS_IV]] ; CHECK-GENERIC-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX14]], align 2 ; CHECK-GENERIC-NEXT: [[CONV15:%.*]] = sext i16 [[TMP1]] to i32 ; CHECK-GENERIC-NEXT: [[MUL16:%.*]] = mul nsw i32 [[CONV15]], [[CONV]] -; CHECK-GENERIC-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[ARG_1:%.*]], i64 [[IDX_EXT]] +; CHECK-GENERIC-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[ARG_1:%.*]], i64 [[INDVARS_IV]] ; CHECK-GENERIC-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 ; CHECK-GENERIC-NEXT: [[ADD21:%.*]] = add nsw i32 [[MUL16]], [[TMP2]] ; CHECK-GENERIC-NEXT: store i32 [[ADD21]], ptr [[ARRAYIDX20]], align 4 -; CHECK-GENERIC-NEXT: [[INC]] = add nuw i32 [[K_03]], 1 -; CHECK-GENERIC-NEXT: [[CMP5:%.*]] = icmp ult i32 [[INC]], [[ARG_0]] -; CHECK-GENERIC-NEXT: br i1 [[CMP5]], label [[FOR_BODY6]], label [[FOR_END]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-GENERIC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-GENERIC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY6]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-GENERIC: for.end: ; CHECK-GENERIC-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll b/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll --- a/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll +++ b/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll @@ -103,19 +103,19 @@ ; CHECK-V8-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[COUNT_1:%.*]], [[LOOP]] ] ; CHECK-V8-NEXT: [[ADDR_A:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 [[IV]] ; CHECK-V8-NEXT: [[ADDR_B:%.*]] = getelementptr i64, ptr [[B:%.*]], i32 [[IV]] -; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 4 -; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 4 +; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 8 +; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 8 ; CHECK-V8-NEXT: [[RES:%.*]] = add i64 [[DATA_A]], [[DATA_B]] ; CHECK-V8-NEXT: [[ADDR_C:%.*]] = getelementptr i64, ptr [[C:%.*]], i32 [[IV]] -; CHECK-V8-NEXT: store i64 [[RES]], ptr [[ADDR_C]], align 4 +; CHECK-V8-NEXT: store i64 [[RES]], ptr [[ADDR_C]], align 8 ; CHECK-V8-NEXT: [[COUNT:%.*]] = add nuw nsw i32 [[IV]], 1 ; CHECK-V8-NEXT: [[ADDR_A_1:%.*]] = getelementptr i64, ptr [[A]], i32 [[COUNT]] ; CHECK-V8-NEXT: [[ADDR_B_1:%.*]] = getelementptr i64, ptr [[B]], i32 [[COUNT]] -; CHECK-V8-NEXT: [[DATA_A_1:%.*]] = load i64, ptr [[ADDR_A_1]], align 4 -; CHECK-V8-NEXT: [[DATA_B_1:%.*]] = load i64, ptr [[ADDR_B_1]], align 4 +; CHECK-V8-NEXT: [[DATA_A_1:%.*]] = load i64, ptr [[ADDR_A_1]], align 8 +; CHECK-V8-NEXT: [[DATA_B_1:%.*]] = load i64, ptr [[ADDR_B_1]], align 8 ; CHECK-V8-NEXT: [[RES_1:%.*]] = add i64 [[DATA_A_1]], [[DATA_B_1]] ; CHECK-V8-NEXT: [[ADDR_C_1:%.*]] = getelementptr i64, ptr [[C]], i32 [[COUNT]] -; CHECK-V8-NEXT: store i64 [[RES_1]], ptr [[ADDR_C_1]], align 4 +; CHECK-V8-NEXT: store i64 [[RES_1]], ptr [[ADDR_C_1]], align 8 ; CHECK-V8-NEXT: [[COUNT_1]] = add nuw nsw i32 [[IV]], 2 ; CHECK-V8-NEXT: [[END_1:%.*]] = icmp ne i32 [[COUNT_1]], 100 ; CHECK-V8-NEXT: br i1 [[END_1]], label [[LOOP]], label [[EXIT:%.*]] @@ 
-150,19 +150,19 @@ ; CHECK-V8-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[COUNT_1:%.*]], [[LOOP]] ] ; CHECK-V8-NEXT: [[ADDR_A:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 [[IV]] ; CHECK-V8-NEXT: [[ADDR_B:%.*]] = getelementptr i64, ptr [[B:%.*]], i32 [[IV]] -; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 4 -; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 4 +; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 8 +; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 8 ; CHECK-V8-NEXT: [[RES:%.*]] = add i64 [[DATA_A]], [[DATA_B]] ; CHECK-V8-NEXT: [[ADDR_C:%.*]] = getelementptr i64, ptr [[C:%.*]], i32 [[IV]] -; CHECK-V8-NEXT: store i64 [[RES]], ptr [[ADDR_C]], align 4 +; CHECK-V8-NEXT: store i64 [[RES]], ptr [[ADDR_C]], align 8 ; CHECK-V8-NEXT: [[COUNT:%.*]] = add nuw nsw i32 [[IV]], 1 ; CHECK-V8-NEXT: [[ADDR_A_1:%.*]] = getelementptr i64, ptr [[A]], i32 [[COUNT]] ; CHECK-V8-NEXT: [[ADDR_B_1:%.*]] = getelementptr i64, ptr [[B]], i32 [[COUNT]] -; CHECK-V8-NEXT: [[DATA_A_1:%.*]] = load i64, ptr [[ADDR_A_1]], align 4 -; CHECK-V8-NEXT: [[DATA_B_1:%.*]] = load i64, ptr [[ADDR_B_1]], align 4 +; CHECK-V8-NEXT: [[DATA_A_1:%.*]] = load i64, ptr [[ADDR_A_1]], align 8 +; CHECK-V8-NEXT: [[DATA_B_1:%.*]] = load i64, ptr [[ADDR_B_1]], align 8 ; CHECK-V8-NEXT: [[RES_1:%.*]] = add i64 [[DATA_A_1]], [[DATA_B_1]] ; CHECK-V8-NEXT: [[ADDR_C_1:%.*]] = getelementptr i64, ptr [[C]], i32 [[COUNT]] -; CHECK-V8-NEXT: store i64 [[RES_1]], ptr [[ADDR_C_1]], align 4 +; CHECK-V8-NEXT: store i64 [[RES_1]], ptr [[ADDR_C_1]], align 8 ; CHECK-V8-NEXT: [[COUNT_1]] = add nuw nsw i32 [[IV]], 2 ; CHECK-V8-NEXT: [[END_1:%.*]] = icmp ne i32 [[COUNT_1]], 100 ; CHECK-V8-NEXT: br i1 [[END_1]], label [[LOOP]], label [[EXIT:%.*]] @@ -310,13 +310,13 @@ ; CHECK-V8-NEXT: [[ACC:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ACC_NEXT:%.*]], [[LOOP]] ] ; CHECK-V8-NEXT: [[ADDR_A:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 [[IV]] ; CHECK-V8-NEXT: [[ADDR_B:%.*]] = getelementptr i64, ptr [[B:%.*]], i32 [[IV]] -; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 4 -; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 4 +; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 8 +; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 8 ; CHECK-V8-NEXT: [[UGT:%.*]] = icmp ugt i64 [[DATA_A]], [[DATA_B]] ; CHECK-V8-NEXT: [[UMAX:%.*]] = select i1 [[UGT]], i64 [[DATA_A]], i64 [[DATA_B]] ; CHECK-V8-NEXT: [[ACC_NEXT]] = add i64 [[UMAX]], [[ACC]] ; CHECK-V8-NEXT: [[ADDR_C:%.*]] = getelementptr i64, ptr [[C:%.*]], i32 [[IV]] -; CHECK-V8-NEXT: store i64 [[UMAX]], ptr [[ADDR_C]], align 4 +; CHECK-V8-NEXT: store i64 [[UMAX]], ptr [[ADDR_C]], align 8 ; CHECK-V8-NEXT: [[COUNT]] = add nuw i32 [[IV]], 1 ; CHECK-V8-NEXT: [[END:%.*]] = icmp ne i32 [[COUNT]], 100 ; CHECK-V8-NEXT: br i1 [[END]], label [[LOOP]], label [[EXIT:%.*]] @@ -356,13 +356,13 @@ ; CHECK-V8-NEXT: [[ACC:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ACC_NEXT:%.*]], [[LOOP]] ] ; CHECK-V8-NEXT: [[ADDR_A:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 [[IV]] ; CHECK-V8-NEXT: [[ADDR_B:%.*]] = getelementptr i64, ptr [[B:%.*]], i32 [[IV]] -; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 4 -; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 4 +; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 8 +; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 8 ; CHECK-V8-NEXT: [[UGT:%.*]] = icmp ugt i64 [[DATA_A]], [[DATA_B]] ; CHECK-V8-NEXT: [[UMAX:%.*]] 
= select i1 [[UGT]], i64 [[DATA_A]], i64 [[DATA_B]]
; CHECK-V8-NEXT: [[ACC_NEXT]] = add i64 [[UMAX]], [[ACC]]
; CHECK-V8-NEXT: [[ADDR_C:%.*]] = getelementptr i64, ptr [[C:%.*]], i32 [[IV]]
-; CHECK-V8-NEXT: store i64 [[UMAX]], ptr [[ADDR_C]], align 4
+; CHECK-V8-NEXT: store i64 [[UMAX]], ptr [[ADDR_C]], align 8
; CHECK-V8-NEXT: [[COUNT]] = add nuw i32 [[IV]], 1
; CHECK-V8-NEXT: [[END:%.*]] = icmp ne i32 [[COUNT]], 100
; CHECK-V8-NEXT: br i1 [[END]], label [[LOOP]], label [[EXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll b/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll
--- a/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll
+++ b/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll
@@ -17,7 +17,7 @@
; CHECK-NEXT: store i32 0, ptr [[X]], align 4
; CHECK-NEXT: br label [[IF_END]]
; CHECK: if.end:
-; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 1
; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[REM]], 1
; CHECK-NEXT: br i1 [[CMP]], label [[WHILE_BODY_1:%.*]], label [[WHILE_END]]
; CHECK: while.body.1:
@@ -28,7 +28,7 @@
; CHECK-NEXT: store i32 0, ptr [[INCDEC_PTR]], align 4
; CHECK-NEXT: br label [[IF_END_1]]
; CHECK: if.end.1:
-; CHECK-NEXT: [[INCDEC_PTR_1:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR_1:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 2
; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[REM]], 2
; CHECK-NEXT: br i1 [[CMP_1]], label [[WHILE_BODY_2:%.*]], label [[WHILE_END]]
; CHECK: while.body.2:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -23,10 +23,10 @@
; TFNONE: vector.body:
; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 4
+; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 8
; TFNONE-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_LOAD]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT: store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 4
+; TFNONE-NEXT: store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 8
; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
@@ -40,10 +40,10 @@
; TFNONE: for.body:
; TFNONE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; TFNONE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
-; TFNONE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4
+; TFNONE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
; TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2:[0-9]+]]
; TFNONE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
-; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4
+; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
; TFNONE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; TFNONE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
; TFNONE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -68,10 +68,10 @@
; TFALWAYS-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; TFALWAYS-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
; TFALWAYS-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; TFALWAYS-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; TFALWAYS-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; TFALWAYS-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
@@ -87,10 +87,10 @@
; TFALWAYS: for.body:
; TFALWAYS-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; TFALWAYS-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
-; TFALWAYS-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4
+; TFALWAYS-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
; TFALWAYS-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR4:[0-9]+]]
; TFALWAYS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
-; TFALWAYS-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4
+; TFALWAYS-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
; TFALWAYS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; TFALWAYS-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
; TFALWAYS-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -115,10 +115,10 @@
; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; TFFALLBACK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
; TFFALLBACK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; TFFALLBACK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; TFFALLBACK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; TFFALLBACK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
@@ -134,10 +134,10 @@
; TFFALLBACK: for.body:
; TFFALLBACK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [
[[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; TFFALLBACK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; TFFALLBACK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; TFFALLBACK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; TFFALLBACK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR4:[0-9]+]] ; TFFALLBACK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; TFFALLBACK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; TFFALLBACK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFFALLBACK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; TFFALLBACK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025 ; TFFALLBACK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] @@ -594,10 +594,10 @@ ; TFNONE: vector.body: ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 4 +; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 ; TFNONE-NEXT: [[TMP5:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) ; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 4 +; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 8 ; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 ; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] @@ -611,10 +611,10 @@ ; TFNONE: for.body: ; TFNONE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; TFNONE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; TFNONE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; TFNONE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR4:[0-9]+]] ; TFNONE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFNONE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; TFNONE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025 ; TFNONE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] @@ -627,10 +627,10 @@ ; TFALWAYS: for.body: ; TFALWAYS-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; TFALWAYS-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDVARS_IV]] -; TFALWAYS-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; TFALWAYS-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; TFALWAYS-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR6:[0-9]+]] ; TFALWAYS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDVARS_IV]] -; TFALWAYS-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; TFALWAYS-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFALWAYS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; TFALWAYS-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025 ; TFALWAYS-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] @@ -652,10 +652,10 @@ ; TFFALLBACK: vector.body: ; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], 
[ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 4 +; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 ; TFFALLBACK-NEXT: [[TMP5:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) ; TFFALLBACK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: store [[TMP5]], ptr [[TMP6]], align 4 +; TFFALLBACK-NEXT: store [[TMP5]], ptr [[TMP6]], align 8 ; TFFALLBACK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; TFFALLBACK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 ; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] @@ -669,10 +669,10 @@ ; TFFALLBACK: for.body: ; TFFALLBACK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; TFFALLBACK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; TFFALLBACK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; TFFALLBACK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; TFFALLBACK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR6:[0-9]+]] ; TFFALLBACK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; TFFALLBACK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; TFFALLBACK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFFALLBACK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; TFFALLBACK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025 ; TFFALLBACK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] @@ -716,10 +716,10 @@ ; TFNONE: vector.body: ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 4 +; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 ; TFNONE-NEXT: [[TMP5:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) ; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 4 +; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 8 ; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 ; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] @@ -733,10 +733,10 @@ ; TFNONE: for.body: ; TFNONE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; TFNONE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; TFNONE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; TFNONE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]] ; TFNONE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFNONE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; TFNONE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025 ; TFNONE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] @@ -761,10 +761,10 @@ ; TFALWAYS-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFALWAYS-NEXT: 
[[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFALWAYS-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) ; TFALWAYS-NEXT: [[TMP6:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) ; TFALWAYS-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP6]], ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]]) +; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP6]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) ; TFALWAYS-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; TFALWAYS-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 ; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] @@ -780,10 +780,10 @@ ; TFALWAYS: for.body: ; TFALWAYS-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; TFALWAYS-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; TFALWAYS-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; TFALWAYS-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; TFALWAYS-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR7:[0-9]+]] ; TFALWAYS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; TFALWAYS-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; TFALWAYS-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFALWAYS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; TFALWAYS-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025 ; TFALWAYS-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] @@ -808,10 +808,10 @@ ; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) ; TFFALLBACK-NEXT: [[TMP6:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) ; TFFALLBACK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP6]], ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]]) +; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP6]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) ; TFFALLBACK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; TFFALLBACK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 ; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] @@ -827,10 +827,10 @@ ; TFFALLBACK: for.body: ; TFFALLBACK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; TFFALLBACK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; TFFALLBACK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; TFFALLBACK-NEXT: 
[[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; TFFALLBACK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR7:[0-9]+]] ; TFFALLBACK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; TFFALLBACK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; TFFALLBACK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFFALLBACK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; TFFALLBACK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025 ; TFFALLBACK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll @@ -33,11 +33,11 @@ ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 8 ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]] -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP15]], align 4 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP15]], align 8 ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64( [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP17]] = and i64 [[TMP16]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64( [[WIDE_LOAD3]]) @@ -67,7 +67,7 @@ ; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX7]], 0 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP25]], align 4 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP25]], align 8 ; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> [[WIDE_LOAD9]]) ; CHECK-NEXT: [[TMP27]] = and i64 [[TMP26]], [[VEC_PHI8]] ; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2 @@ -84,7 +84,7 @@ ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[AND:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX11]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[L3:%.*]] = load i64, ptr [[L2]], align 4 +; CHECK-NEXT: [[L3:%.*]] = load i64, ptr [[L2]], align 8 ; CHECK-NEXT: [[AND]] = and i64 [[RDX]], [[L3]] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll @@ -33,11 +33,11 @@ ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] ; 
CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 8
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]]
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 4
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 8
; CHECK-NEXT: [[TMP16]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP17]] = add <vscale x 2 x i64> [[WIDE_LOAD3]], [[VEC_PHI2]]
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
@@ -67,7 +67,7 @@
; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX7]], 0
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP23]]
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP25]], align 4
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP25]], align 8
; CHECK-NEXT: [[TMP26]] = add <2 x i64> [[WIDE_LOAD9]], [[VEC_PHI8]]
; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2
; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
@@ -84,7 +84,7 @@
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX11]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[ADD]] = add i64 [[TMP29]], [[SUM]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
@@ -55,11 +55,11 @@
; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8
; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[START_2]], i64 [[TMP28]]
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: store <vscale x 2 x i64> zeroinitializer, ptr [[TMP29]], align 4
+; CHECK-NEXT: store <vscale x 2 x i64> zeroinitializer, ptr [[TMP29]], align 8
; CHECK-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 2
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i64 [[TMP31]]
-; CHECK-NEXT: store <vscale x 2 x i64> zeroinitializer, ptr [[TMP32]], align 4
+; CHECK-NEXT: store <vscale x 2 x i64> zeroinitializer, ptr [[TMP32]], align 8
; CHECK-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP34]]
@@ -79,7 +79,7 @@
; CHECK: loop:
; CHECK-NEXT: [[IV_1:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[IV_2:%.*]] = phi ptr [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: store i64 0, ptr [[IV_2]], align 4
+; CHECK-NEXT: store i64 0, ptr [[IV_2]], align 8
; CHECK-NEXT:
[[IV_2_NEXT]] = getelementptr inbounds ptr, ptr [[IV_2]], i64 1 ; CHECK-NEXT: [[IV_1_NEXT]] = getelementptr inbounds ptr, ptr [[IV_1]], i64 1 ; CHECK-NEXT: [[CMP_I_I_NOT_I:%.*]] = icmp eq ptr [[IV_2_NEXT]], [[END]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll @@ -61,17 +61,17 @@ ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i64, ptr [[SRC_2]], i64 [[TMP17]] ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i64, ptr [[SRC_2]], i64 [[TMP22]] ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i64, ptr [[TMP23]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP27]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP27]], align 8 ; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 2 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i64, ptr [[TMP23]], i64 [[TMP29]] -; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load , ptr [[TMP30]], align 4 +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load , ptr [[TMP30]], align 8 ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i64, ptr [[TMP25]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load , ptr [[TMP31]], align 4 +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load , ptr [[TMP31]], align 8 ; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 2 ; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i64, ptr [[TMP25]], i64 [[TMP33]] -; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load , ptr [[TMP34]], align 4 +; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load , ptr [[TMP34]], align 8 ; CHECK-NEXT: [[TMP35:%.*]] = add [[WIDE_LOAD]], [[WIDE_LOAD13]] ; CHECK-NEXT: [[TMP36:%.*]] = add [[WIDE_LOAD12]], [[WIDE_LOAD14]] ; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i64, ptr [[DST_1]], i64 [[TMP17]] @@ -79,17 +79,17 @@ ; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i64, ptr [[DST_2]], i64 [[TMP17]] ; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i64, ptr [[DST_2]], i64 [[TMP22]] ; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i64, ptr [[TMP37]], i32 0 -; CHECK-NEXT: store [[TMP35]], ptr [[TMP41]], align 4 +; CHECK-NEXT: store [[TMP35]], ptr [[TMP41]], align 8 ; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 2 ; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i64, ptr [[TMP37]], i64 [[TMP43]] -; CHECK-NEXT: store [[TMP36]], ptr [[TMP44]], align 4 +; CHECK-NEXT: store [[TMP36]], ptr [[TMP44]], align 8 ; CHECK-NEXT: [[TMP45:%.*]] = getelementptr i64, ptr [[TMP39]], i32 0 -; CHECK-NEXT: store [[TMP35]], ptr [[TMP45]], align 4 +; CHECK-NEXT: store [[TMP35]], ptr [[TMP45]], align 8 ; CHECK-NEXT: [[TMP46:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 2 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i64, ptr [[TMP39]], i64 [[TMP47]] -; CHECK-NEXT: store [[TMP36]], ptr [[TMP48]], align 4 +; CHECK-NEXT: store [[TMP36]], ptr [[TMP48]], align 8 ; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP49]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP50]] @@ -105,13 +105,13 @@ ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i64, ptr [[SRC_1]], i64 [[IV]] ; CHECK-NEXT: [[GEP_SRC_2:%.*]] = 
getelementptr i64, ptr [[SRC_2]], i64 [[IV]] -; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 4 -; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[GEP_SRC_2]], align 4 +; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 8 +; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[GEP_SRC_2]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = add i64 [[L_1]], [[L_2]] ; CHECK-NEXT: [[GEP_DST_1:%.*]] = getelementptr i64, ptr [[DST_1]], i64 [[IV]] ; CHECK-NEXT: [[GEP_DST_2:%.*]] = getelementptr i64, ptr [[DST_2]], i64 [[IV]] -; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP_DST_1]], align 4 -; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP_DST_2]], align 4 +; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP_DST_1]], align 8 +; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP_DST_2]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 ; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[CMP10]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll @@ -185,11 +185,11 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @foo_vector_fixed4_nomask(<4 x i64> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 4 +; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -201,10 +201,10 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR1:[0-9]+]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] @@ -241,11 +241,11 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call <4 
x i64> @foo_vector_fixed4_mask(<4 x i64> [[WIDE_LOAD]], <4 x i1> ) ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 4 +; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -257,10 +257,10 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2:[0-9]+]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -297,11 +297,11 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @foo_vector_fixed4_nomask(<4 x i64> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 4 +; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -313,10 +313,10 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll --- 
a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll
@@ -8,21 +8,20 @@
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[INDEX]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[ARR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double>
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast <2 x double> @__sind2(<2 x double> [[TMP2]])
-; CHECK-NEXT: [[TMP4]] = fadd fast <2 x double> [[TMP3]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[ARR:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double>
+; CHECK-NEXT: [[TMP2:%.*]] = tail call fast <2 x double> @__sind2(<2 x double> [[TMP1]])
+; CHECK-NEXT: [[TMP3]] = fadd fast <2 x double> [[TMP2]], [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <2 x double> [ [[TMP4]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[DOTLCSSA]])
-; CHECK-NEXT: ret double [[TMP6]]
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <2 x double> [ [[TMP3]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[DOTLCSSA]])
+; CHECK-NEXT: ret double [[TMP5]]
;
entry:
br label %for.cond
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses-zve32x.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses-zve32x.ll
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses-zve32x.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses-zve32x.ll
@@ -10,14 +10,14 @@
; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 4
+; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 4
+; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 4
+; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 4
+; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -77,22 +77,49 @@ define void @load_store_factor2_i64(ptr %p) {
; CHECK-LABEL: @load_store_factor2_i64(
; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], <i64 2, i64 2, i64 2, i64 2>
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP6]], i32 -1
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 4
+; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 4
+; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 4
; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 -; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 4 +; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -150,7 +177,7 @@ ; CHECK-NEXT: store <6 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -175,7 +202,7 @@ ; CHECK-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -212,27 +239,75 @@ define void @load_store_factor3_i64(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i64( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = mul [[VEC_IND]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[P:%.*]], [[TMP10]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP11]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP12:%.*]] = add [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void 
@llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP12]], [[TMP11]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P]], [[TMP13]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP14]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP15:%.*]] = add [[WIDE_MASKED_GATHER1]], shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP15]], [[TMP14]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP13]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i64, ptr [[P]], [[TMP16]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP17]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP18:%.*]] = add [[WIDE_MASKED_GATHER2]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP18]], [[TMP17]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 4 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 4 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 ; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] -; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 4 +; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 -; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 4 +; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 ; CHECK-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1 ; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] -; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 4 +; 
CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 -; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 4 +; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -269,52 +344,125 @@ define void @load_store_factor8(ptr %p) { ; CHECK-LABEL: @load_store_factor8( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = shl [[VEC_IND]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[P:%.*]], [[TMP10]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP11]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP12:%.*]] = add [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP12]], [[TMP11]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P]], [[TMP13]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP14]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP15:%.*]] = add [[WIDE_MASKED_GATHER1]], shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP15]], [[TMP14]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP13]], shufflevector ( 
insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i64, ptr [[P]], [[TMP16]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP17]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP18:%.*]] = add [[WIDE_MASKED_GATHER2]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP18]], [[TMP17]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP19:%.*]] = add [[TMP16]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[P]], [[TMP19]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP20]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP21:%.*]] = add [[WIDE_MASKED_GATHER3]], shufflevector ( insertelement ( poison, i64 4, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP21]], [[TMP20]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP19]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i64, ptr [[P]], [[TMP22]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP23]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP24:%.*]] = add [[WIDE_MASKED_GATHER4]], shufflevector ( insertelement ( poison, i64 5, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP24]], [[TMP23]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP25:%.*]] = add [[TMP22]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i64, ptr [[P]], [[TMP25]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP26]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP27:%.*]] = add [[WIDE_MASKED_GATHER5]], shufflevector ( insertelement ( poison, i64 6, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP27]], [[TMP26]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP28:%.*]] = add [[TMP25]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i64, ptr [[P]], [[TMP28]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP29]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP30:%.*]] = add [[WIDE_MASKED_GATHER6]], shufflevector ( insertelement ( poison, i64 7, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP30]], [[TMP29]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP31:%.*]] = add [[TMP28]], shufflevector ( insertelement ( poison, 
i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i64, ptr [[P]], [[TMP31]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP32]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP33:%.*]] = add [[WIDE_MASKED_GATHER7]], shufflevector ( insertelement ( poison, i64 8, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP33]], [[TMP32]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP35]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 4 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 4 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 ; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] -; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 4 +; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 -; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 4 +; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 ; CHECK-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1 ; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] -; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 4 +; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 -; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 4 +; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 ; CHECK-NEXT: [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] -; CHECK-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 4 +; CHECK-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 8 ; CHECK-NEXT: [[Y3:%.*]] = add i64 [[X3]], 4 -; CHECK-NEXT: store i64 [[Y3]], ptr [[Q3]], align 4 +; CHECK-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8 ; CHECK-NEXT: [[OFFSET4:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q4:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET4]] -; CHECK-NEXT: [[X4:%.*]] = load i64, ptr [[Q4]], align 4 +; CHECK-NEXT: [[X4:%.*]] = load i64, ptr [[Q4]], align 8 ; CHECK-NEXT: [[Y4:%.*]] = add i64 [[X4]], 5 -; CHECK-NEXT: store i64 [[Y4]], ptr [[Q4]], align 4 +; 
CHECK-NEXT: store i64 [[Y4]], ptr [[Q4]], align 8 ; CHECK-NEXT: [[OFFSET5:%.*]] = add i64 [[OFFSET4]], 1 ; CHECK-NEXT: [[Q5:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET5]] -; CHECK-NEXT: [[X5:%.*]] = load i64, ptr [[Q5]], align 4 +; CHECK-NEXT: [[X5:%.*]] = load i64, ptr [[Q5]], align 8 ; CHECK-NEXT: [[Y5:%.*]] = add i64 [[X5]], 6 -; CHECK-NEXT: store i64 [[Y5]], ptr [[Q5]], align 4 +; CHECK-NEXT: store i64 [[Y5]], ptr [[Q5]], align 8 ; CHECK-NEXT: [[OFFSET6:%.*]] = add i64 [[OFFSET5]], 1 ; CHECK-NEXT: [[Q6:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET6]] -; CHECK-NEXT: [[X6:%.*]] = load i64, ptr [[Q6]], align 4 +; CHECK-NEXT: [[X6:%.*]] = load i64, ptr [[Q6]], align 8 ; CHECK-NEXT: [[Y6:%.*]] = add i64 [[X6]], 7 -; CHECK-NEXT: store i64 [[Y6]], ptr [[Q6]], align 4 +; CHECK-NEXT: store i64 [[Y6]], ptr [[Q6]], align 8 ; CHECK-NEXT: [[OFFSET7:%.*]] = add i64 [[OFFSET6]], 1 ; CHECK-NEXT: [[Q7:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET7]] -; CHECK-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 4 +; CHECK-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; CHECK-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 -; CHECK-NEXT: store i64 [[Y7]], ptr [[Q7]], align 4 +; CHECK-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -410,7 +558,7 @@ ; CHECK-NEXT: store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -429,7 +577,7 @@ ; CHECK-NEXT: store i32 [[RES]], ptr [[DST]], align 4 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -461,21 +609,55 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-LABEL: @combine_load_factor2_i64( ; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP6]], align 8 +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8 +; CHECK-NEXT: 
[[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]] +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[Q]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0 +; CHECK-NEXT: store <4 x i64> [[TMP8]], ptr [[TMP12]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4 +; CHECK-NEXT: store <4 x i64> [[TMP9]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 4 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 ; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] -; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 4 +; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[I]] -; CHECK-NEXT: store i64 [[RES]], ptr [[DST]], align 4 +; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: store i64 [[RES]], ptr [[DST]], align 8 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll @@ -21,9 +21,9 @@ ; LMUL1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 ; LMUL1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP2]] ; LMUL1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 4 +; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 ; LMUL1-NEXT: [[TMP5:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; LMUL1-NEXT: store 
[[TMP5]], ptr [[TMP4]], align 4 +; LMUL1-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 ; LMUL1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; LMUL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; LMUL1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -37,9 +37,9 @@ ; LMUL1: for.body: ; LMUL1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; LMUL1-NEXT: [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]] -; LMUL1-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 4 +; LMUL1-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 8 ; LMUL1-NEXT: [[W:%.*]] = add i64 [[V]], 1 -; LMUL1-NEXT: store i64 [[W]], ptr [[Q]], align 4 +; LMUL1-NEXT: store i64 [[W]], ptr [[Q]], align 8 ; LMUL1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; LMUL1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; LMUL1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] @@ -63,9 +63,9 @@ ; LMUL2-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; LMUL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]] ; LMUL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 -; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 ; LMUL2-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; LMUL2-NEXT: store [[TMP7]], ptr [[TMP6]], align 4 +; LMUL2-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 ; LMUL2-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; LMUL2-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 ; LMUL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] @@ -80,9 +80,9 @@ ; LMUL2: for.body: ; LMUL2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; LMUL2-NEXT: [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]] -; LMUL2-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 4 +; LMUL2-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 8 ; LMUL2-NEXT: [[W:%.*]] = add i64 [[V]], 1 -; LMUL2-NEXT: store i64 [[W]], ptr [[Q]], align 4 +; LMUL2-NEXT: store i64 [[W]], ptr [[Q]], align 8 ; LMUL2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; LMUL2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; LMUL2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] @@ -106,9 +106,9 @@ ; LMUL4-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; LMUL4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]] ; LMUL4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 -; LMUL4-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; LMUL4-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 ; LMUL4-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; LMUL4-NEXT: store [[TMP7]], ptr [[TMP6]], align 4 +; LMUL4-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 ; LMUL4-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; LMUL4-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 ; LMUL4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] @@ -123,9 +123,9 @@ ; LMUL4: for.body: ; LMUL4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; LMUL4-NEXT: [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]] -; LMUL4-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 4 +; LMUL4-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 8 ; LMUL4-NEXT: 
[[W:%.*]] = add i64 [[V]], 1 -; LMUL4-NEXT: store i64 [[W]], ptr [[Q]], align 4 +; LMUL4-NEXT: store i64 [[W]], ptr [[Q]], align 8 ; LMUL4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; LMUL4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; LMUL4-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] @@ -149,9 +149,9 @@ ; LMUL8-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; LMUL8-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]] ; LMUL8-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 -; LMUL8-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; LMUL8-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 ; LMUL8-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; LMUL8-NEXT: store [[TMP7]], ptr [[TMP6]], align 4 +; LMUL8-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 ; LMUL8-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; LMUL8-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8 ; LMUL8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] @@ -166,9 +166,9 @@ ; LMUL8: for.body: ; LMUL8-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; LMUL8-NEXT: [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]] -; LMUL8-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 4 +; LMUL8-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 8 ; LMUL8-NEXT: [[W:%.*]] = add i64 [[V]], 1 -; LMUL8-NEXT: store i64 [[W]], ptr [[Q]], align 4 +; LMUL8-NEXT: store i64 [[W]], ptr [[Q]], align 8 ; LMUL8-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; LMUL8-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; LMUL8-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -22,9 +22,9 @@ ; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP2]] ; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; RV32: vector.memcheck: -; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880 -; RV32-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940 -; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752 +; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 79880 +; RV32-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i32 39940 +; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 159752 ; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] ; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] ; RV32-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] diff --git a/llvm/test/Transforms/MemCpyOpt/no-libcalls.ll b/llvm/test/Transforms/MemCpyOpt/no-libcalls.ll --- a/llvm/test/Transforms/MemCpyOpt/no-libcalls.ll +++ b/llvm/test/Transforms/MemCpyOpt/no-libcalls.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=memcpyopt < %s | FileCheck %s --check-prefixes=CHECK,LIBCALLS +; RUN: opt -S -passes=memcpyopt -mtriple=x86_64 < %s | FileCheck %s --check-prefixes=CHECK,LIBCALLS ; RUN: opt -S -passes=memcpyopt -mtriple=amdgcn-- < %s | FileCheck %s 
--check-prefixes=CHECK,NO-LIBCALLS ; RUN: opt -S -passes=memcpyopt -mtriple=amdgcn-- -enable-memcpyopt-without-libcalls < %s \ ; RUN: | FileCheck %s --check-prefixes=CHECK,LIBCALLS @@ -38,12 +38,12 @@ define void @dont_create_memcpy(ptr %p1, ptr %p2) { ; LIBCALLS-LABEL: @dont_create_memcpy( -; LIBCALLS-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 4 [[P2:%.*]], ptr align 4 [[P1:%.*]], i64 8, i1 false) +; LIBCALLS-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[P2:%.*]], ptr align 8 [[P1:%.*]], i64 8, i1 false) ; LIBCALLS-NEXT: ret void ; ; NO-LIBCALLS-LABEL: @dont_create_memcpy( -; NO-LIBCALLS-NEXT: [[V:%.*]] = load [[TY:%.*]], ptr [[P1:%.*]], align 4 -; NO-LIBCALLS-NEXT: store [[TY]] [[V]], ptr [[P2:%.*]], align 4 +; NO-LIBCALLS-NEXT: [[V:%.*]] = load [[TY:%.*]], ptr [[P1:%.*]], align 8 +; NO-LIBCALLS-NEXT: store [[TY]] [[V]], ptr [[P2:%.*]], align 8 ; NO-LIBCALLS-NEXT: ret void ; %v = load %ty, ptr %p1 diff --git a/llvm/test/Transforms/MergeICmps/X86/addressspaces.ll b/llvm/test/Transforms/MergeICmps/X86/addressspaces.ll --- a/llvm/test/Transforms/MergeICmps/X86/addressspaces.ll +++ b/llvm/test/Transforms/MergeICmps/X86/addressspaces.ll @@ -46,13 +46,13 @@ ; CHECK-NEXT: [[PTR_B1:%.*]] = getelementptr inbounds [2 x i64], ptr addrspace(11) [[B:%.*]], i64 0, i64 1 ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[A0:%.*]] = load i64, ptr addrspace(11) [[A]], align 4 -; CHECK-NEXT: [[B0:%.*]] = load i64, ptr addrspace(11) [[B]], align 4 +; CHECK-NEXT: [[A0:%.*]] = load i64, ptr addrspace(11) [[A]], align 8 +; CHECK-NEXT: [[B0:%.*]] = load i64, ptr addrspace(11) [[B]], align 8 ; CHECK-NEXT: [[COND0:%.*]] = icmp eq i64 [[A0]], [[B0]] ; CHECK-NEXT: br i1 [[COND0]], label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[A1:%.*]] = load i64, ptr addrspace(11) [[PTR_A1]], align 4 -; CHECK-NEXT: [[B1:%.*]] = load i64, ptr addrspace(11) [[PTR_B1]], align 4 +; CHECK-NEXT: [[A1:%.*]] = load i64, ptr addrspace(11) [[PTR_A1]], align 8 +; CHECK-NEXT: [[B1:%.*]] = load i64, ptr addrspace(11) [[PTR_B1]], align 8 ; CHECK-NEXT: [[COND1:%.*]] = icmp eq i64 [[A1]], [[B1]] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: diff --git a/llvm/test/Transforms/NewGVN/refine-stores.ll b/llvm/test/Transforms/NewGVN/refine-stores.ll --- a/llvm/test/Transforms/NewGVN/refine-stores.ll +++ b/llvm/test/Transforms/NewGVN/refine-stores.ll @@ -56,7 +56,7 @@ ; CHECK-NEXT: b: ; CHECK-NEXT: br label [[C:%.*]] ; CHECK: c: -; CHECK-NEXT: store i64 undef, ptr null, align 4 +; CHECK-NEXT: store i64 undef, ptr null, align 8 ; CHECK-NEXT: br label [[E:%.*]] ; CHECK: e: ; CHECK-NEXT: store ptr undef, ptr null, align 8 @@ -93,9 +93,9 @@ ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[TMP3]], 0 ; CHECK-NEXT: br i1 [[TMP4]], label [[BB7]], label [[BB5:%.*]] ; CHECK: bb5: -; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr null, align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr null, align 8 ; CHECK-NEXT: call void @quux() -; CHECK-NEXT: store i64 [[TMP6]], ptr undef, align 4 +; CHECK-NEXT: store i64 [[TMP6]], ptr undef, align 8 ; CHECK-NEXT: br label [[BB7]] ; CHECK: bb7: ; CHECK-NEXT: [[TMP8]] = add i64 [[TMP3]], 1 diff --git a/llvm/test/Transforms/OpenMP/barrier_removal.ll b/llvm/test/Transforms/OpenMP/barrier_removal.ll --- a/llvm/test/Transforms/OpenMP/barrier_removal.ll +++ b/llvm/test/Transforms/OpenMP/barrier_removal.ll @@ -329,7 +329,7 @@ define void @pos_priv_mem() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_priv_mem ; CHECK-SAME: () #[[ATTR4]] { -; CHECK-NEXT: [[ARG:%.*]] = load ptr addrspace(5), ptr 
@GPtr5, align 8 +; CHECK-NEXT: [[ARG:%.*]] = load ptr addrspace(5), ptr @GPtr5, align 4 ; CHECK-NEXT: [[LOC:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[A:%.*]] = load i32, ptr @PG1, align 4 ; CHECK-NEXT: store i32 [[A]], ptr [[LOC]], align 4 diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll --- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll @@ -41,49 +41,47 @@ define void @loop(ptr %X, ptr %Y) { ; CHECK-LABEL: @loop( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X5:%.*]] = ptrtoint ptr [[X:%.*]] to i64 -; CHECK-NEXT: [[Y6:%.*]] = ptrtoint ptr [[Y:%.*]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[X5]], [[Y6]] +; CHECK-NEXT: [[X6:%.*]] = ptrtoint ptr [[X:%.*]] to i64 +; CHECK-NEXT: [[Y7:%.*]] = ptrtoint ptr [[Y:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[X6]], [[Y7]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 ; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[INDEX]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <2 x double>, ptr [[TMP3]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD7]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD7]], -; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP6]], <2 x double> , <2 x double> [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP7]], <2 x double> , <2 x double> [[WIDE_LOAD7]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i64 2 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x double>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD8]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD8]], +; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP5]], <2 x double> , <2 x double> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP6]], <2 x double> , <2 x double> [[WIDE_LOAD8]] +; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP4]], <2 x double> zeroinitializer, <2 x double> [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP5]], <2 x double> zeroinitializer, <2 x double> [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[TMP1]] +; 
CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[TMP11]], i64 2 ; CHECK-NEXT: store <2 x double> [[TMP10]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[TMP12]], i64 2 -; CHECK-NEXT: store <2 x double> [[TMP11]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20000 -; CHECK-NEXT: br i1 [[TMP14]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20000 +; CHECK-NEXT: br i1 [[TMP13]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; CHECK: for.body: -; CHECK-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_04]] to i64 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt double [[TMP15]], 0.000000e+00 -; CHECK-NEXT: [[CMP1_I:%.*]] = fcmp ogt double [[TMP15]], 6.000000e+00 -; CHECK-NEXT: [[DOTV_I:%.*]] = select i1 [[CMP1_I]], double 6.000000e+00, double [[TMP15]] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt double [[TMP14]], 0.000000e+00 +; CHECK-NEXT: [[CMP1_I:%.*]] = fcmp ogt double [[TMP14]], 6.000000e+00 +; CHECK-NEXT: [[DOTV_I:%.*]] = select i1 [[CMP1_I]], double 6.000000e+00, double [[TMP14]] ; CHECK-NEXT: [[RETVAL_0_I:%.*]] = select i1 [[CMP_I]], double 0.000000e+00, double [[DOTV_I]] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store double [[RETVAL_0_I]], ptr [[ARRAYIDX2]], align 8 -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_04]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I_04]], 19999 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20000 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; entry: %X.addr = alloca ptr, align 8 @@ -193,8 +191,8 @@ ; CHECK-NEXT: [[ADD_SINK:%.*]] = phi float [ [[ADD]], [[ELSE]] ], [ [[MUL2_I81_I]], [[LOOP_BODY]] ] ; CHECK-NEXT: store float [[ADD_SINK]], ptr [[B_GEP_0]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 -; CHECK-NEXT: [[CMP_0:%.*]] = icmp ult i64 [[IV1]], 9999 -; CHECK-NEXT: br i1 [[CMP_0]], label [[LOOP_BODY]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 10000 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git 
a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll --- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll @@ -92,21 +92,20 @@ ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <225 x double>, ptr [[B:%.*]], i64 0, i64 [[CONV6]] ; CHECK-NEXT: br label [[FOR_BODY4_US:%.*]] ; CHECK: for.body4.us: -; CHECK-NEXT: [[K_011_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[FOR_BODY4_US]] ] -; CHECK-NEXT: [[CONV_US:%.*]] = zext i32 [[K_011_US]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[K_011_US]], 225 +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY4_US]] ] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[INDVARS_IV]], 225 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP2]]) -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <225 x double>, ptr [[A:%.*]], i64 0, i64 [[CONV_US]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <225 x double>, ptr [[A:%.*]], i64 0, i64 [[INDVARS_IV]] ; CHECK-NEXT: [[MATRIXEXT_US:%.*]] = load double, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[MATRIXEXT8_US:%.*]] = load double, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[MUL_US:%.*]] = fmul double [[MATRIXEXT_US]], [[MATRIXEXT8_US]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[CONV_US]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[INDVARS_IV]] ; CHECK-NEXT: [[MATRIXEXT11_US:%.*]] = load double, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[SUB_US:%.*]] = fsub double [[MATRIXEXT11_US]], [[MUL_US]] ; CHECK-NEXT: store double [[SUB_US]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[K_011_US]], 1 -; CHECK-NEXT: [[CMP2_US:%.*]] = icmp ult i32 [[INC_US]], [[I]] -; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY4_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[CONV6]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]], label [[FOR_BODY4_US]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us: ; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[CONV6]], 15 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 [[I]], 210 @@ -114,10 +113,9 @@ ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP5]] ; CHECK-NEXT: br label [[FOR_BODY4_US_1:%.*]] ; CHECK: for.body4.us.1: -; CHECK-NEXT: [[K_011_US_1:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[INC_US_1:%.*]], [[FOR_BODY4_US_1]] ] -; CHECK-NEXT: [[CONV_US_1:%.*]] = zext i32 [[K_011_US_1]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[CONV_US_1]], 15 -; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i32 [[K_011_US_1]], 210 +; CHECK-NEXT: [[INDVARS_IV_1:%.*]] = phi i64 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_BODY4_US_1]] ] +; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[INDVARS_IV_1]], 15 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i64 [[INDVARS_IV_1]], 210 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP9]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[TMP8]] ; CHECK-NEXT: [[MATRIXEXT_US_1:%.*]] = load double, ptr [[TMP10]], align 8 @@ -127,9 +125,9 @@ ; CHECK-NEXT: 
[[MATRIXEXT11_US_1:%.*]] = load double, ptr [[TMP11]], align 8 ; CHECK-NEXT: [[SUB_US_1:%.*]] = fsub double [[MATRIXEXT11_US_1]], [[MUL_US_1]] ; CHECK-NEXT: store double [[SUB_US_1]], ptr [[TMP11]], align 8 -; CHECK-NEXT: [[INC_US_1]] = add nuw nsw i32 [[K_011_US_1]], 1 -; CHECK-NEXT: [[CMP2_US_1:%.*]] = icmp ult i32 [[INC_US_1]], [[I]] -; CHECK-NEXT: br i1 [[CMP2_US_1]], label [[FOR_BODY4_US_1]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1:%.*]] +; CHECK-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV_1]], 1 +; CHECK-NEXT: [[EXITCOND_NOT_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], [[CONV6]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_1]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1:%.*]], label [[FOR_BODY4_US_1]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.1: ; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[CONV6]], 30 ; CHECK-NEXT: [[TMP13:%.*]] = icmp ult i32 [[I]], 195 @@ -137,10 +135,9 @@ ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP12]] ; CHECK-NEXT: br label [[FOR_BODY4_US_2:%.*]] ; CHECK: for.body4.us.2: -; CHECK-NEXT: [[K_011_US_2:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]] ], [ [[INC_US_2:%.*]], [[FOR_BODY4_US_2]] ] -; CHECK-NEXT: [[CONV_US_2:%.*]] = zext i32 [[K_011_US_2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[CONV_US_2]], 30 -; CHECK-NEXT: [[TMP16:%.*]] = icmp ult i32 [[K_011_US_2]], 195 +; CHECK-NEXT: [[INDVARS_IV_2:%.*]] = phi i64 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]] ], [ [[INDVARS_IV_NEXT_2:%.*]], [[FOR_BODY4_US_2]] ] +; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[INDVARS_IV_2]], 30 +; CHECK-NEXT: [[TMP16:%.*]] = icmp ult i64 [[INDVARS_IV_2]], 195 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP16]]) ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[TMP15]] ; CHECK-NEXT: [[MATRIXEXT_US_2:%.*]] = load double, ptr [[TMP17]], align 8 @@ -150,9 +147,9 @@ ; CHECK-NEXT: [[MATRIXEXT11_US_2:%.*]] = load double, ptr [[TMP18]], align 8 ; CHECK-NEXT: [[SUB_US_2:%.*]] = fsub double [[MATRIXEXT11_US_2]], [[MUL_US_2]] ; CHECK-NEXT: store double [[SUB_US_2]], ptr [[TMP18]], align 8 -; CHECK-NEXT: [[INC_US_2]] = add nuw nsw i32 [[K_011_US_2]], 1 -; CHECK-NEXT: [[CMP2_US_2:%.*]] = icmp ult i32 [[INC_US_2]], [[I]] -; CHECK-NEXT: br i1 [[CMP2_US_2]], label [[FOR_BODY4_US_2]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2:%.*]] +; CHECK-NEXT: [[INDVARS_IV_NEXT_2]] = add nuw nsw i64 [[INDVARS_IV_2]], 1 +; CHECK-NEXT: [[EXITCOND_NOT_2:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_2]], [[CONV6]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_2]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2:%.*]], label [[FOR_BODY4_US_2]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.2: ; CHECK-NEXT: [[TMP19:%.*]] = add nuw nsw i64 [[CONV6]], 45 ; CHECK-NEXT: [[TMP20:%.*]] = icmp ult i32 [[I]], 180 @@ -160,10 +157,9 @@ ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP19]] ; CHECK-NEXT: br label [[FOR_BODY4_US_3:%.*]] ; CHECK: for.body4.us.3: -; CHECK-NEXT: [[K_011_US_3:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]] ], [ [[INC_US_3:%.*]], [[FOR_BODY4_US_3]] ] -; CHECK-NEXT: [[CONV_US_3:%.*]] = zext i32 [[K_011_US_3]] to i64 -; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw i64 [[CONV_US_3]], 45 -; CHECK-NEXT: [[TMP23:%.*]] = icmp ult i32 [[K_011_US_3]], 180 +; CHECK-NEXT: [[INDVARS_IV_3:%.*]] = phi i64 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]] ], [ [[INDVARS_IV_NEXT_3:%.*]], 
[[FOR_BODY4_US_3]] ] +; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw i64 [[INDVARS_IV_3]], 45 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ult i64 [[INDVARS_IV_3]], 180 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP23]]) ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[TMP22]] ; CHECK-NEXT: [[MATRIXEXT_US_3:%.*]] = load double, ptr [[TMP24]], align 8 @@ -173,9 +169,9 @@ ; CHECK-NEXT: [[MATRIXEXT11_US_3:%.*]] = load double, ptr [[TMP25]], align 8 ; CHECK-NEXT: [[SUB_US_3:%.*]] = fsub double [[MATRIXEXT11_US_3]], [[MUL_US_3]] ; CHECK-NEXT: store double [[SUB_US_3]], ptr [[TMP25]], align 8 -; CHECK-NEXT: [[INC_US_3]] = add nuw nsw i32 [[K_011_US_3]], 1 -; CHECK-NEXT: [[CMP2_US_3:%.*]] = icmp ult i32 [[INC_US_3]], [[I]] -; CHECK-NEXT: br i1 [[CMP2_US_3]], label [[FOR_BODY4_US_3]], label [[FOR_COND_CLEANUP]] +; CHECK-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV_3]], 1 +; CHECK-NEXT: [[EXITCOND_NOT_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], [[CONV6]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_3]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY4_US_3]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll --- a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll @@ -16,19 +16,20 @@ ; CHECK-NEXT: [[START_INT_I:%.*]] = ptrtoint ptr [[START_I]] to i64 ; CHECK-NEXT: [[END_INT_I:%.*]] = ptrtoint ptr [[END_I]] to i64 ; CHECK-NEXT: [[SUB_I:%.*]] = sub i64 [[END_INT_I]], [[START_INT_I]] +; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 0) ; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr [[VEC]], ptr [[B:%.*]], i64 0, i32 1 ; CHECK-NEXT: [[START_I1_PEEL:%.*]] = load ptr, ptr [[B]], align 8 ; CHECK-NEXT: [[END_I3_PEEL:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8 ; CHECK-NEXT: [[START_INT_I4_PEEL:%.*]] = ptrtoint ptr [[START_I1_PEEL]] to i64 ; CHECK-NEXT: [[END_INT_I5_PEEL:%.*]] = ptrtoint ptr [[END_I3_PEEL]] to i64 ; CHECK-NEXT: [[SUB_I6_PEEL:%.*]] = sub i64 [[END_INT_I5_PEEL]], [[START_INT_I4_PEEL]] -; CHECK-NEXT: [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 4 -; CHECK-NEXT: [[LV_I9_PEEL:%.*]] = load i64, ptr [[START_I1_PEEL]], align 4 +; CHECK-NEXT: [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 8 +; CHECK-NEXT: [[LV_I9_PEEL:%.*]] = load i64, ptr [[START_I1_PEEL]], align 8 ; CHECK-NEXT: [[SUM_NEXT_PEEL:%.*]] = add i64 [[LV_I_PEEL]], [[LV_I9_PEEL]] -; CHECK-NEXT: [[C_PEEL:%.*]] = icmp sgt i64 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[C_PEEL]], label [[LOOP_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK-NEXT: [[EXITCOND_PEEL_NOT:%.*]] = icmp slt i64 [[N]], 1 +; CHECK-NEXT: br i1 [[EXITCOND_PEEL_NOT]], label [[EXIT:%.*]], label [[LOOP_PREHEADER:%.*]] ; CHECK: loop.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 ; CHECK-NEXT: [[UMIN:%.*]] = tail call i64 @llvm.umin.i64(i64 [[SUB_I6_PEEL]], i64 [[TMP0]]) ; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[UMIN]] ; CHECK-NEXT: [[UMIN15:%.*]] = tail call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SUB_I]]) @@ -49,13 +50,13 @@ ; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or i64 
[[INDEX]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <2 x i64>, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x i64>, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <2 x i64>, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[WIDE_LOAD17]], [[VEC_PHI16]] ; CHECK-NEXT: [[TMP12]] = add <2 x i64> [[TMP10]], [[WIDE_LOAD18]] @@ -87,14 +88,14 @@ ; CHECK-NEXT: unreachable ; CHECK: at_with_int_conversion.exit11: ; CHECK-NEXT: [[GEP_IDX_I:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[IV]] -; CHECK-NEXT: [[LV_I:%.*]] = load i64, ptr [[GEP_IDX_I]], align 4 +; CHECK-NEXT: [[LV_I:%.*]] = load i64, ptr [[GEP_IDX_I]], align 8 ; CHECK-NEXT: [[GEP_IDX_I8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[IV]] -; CHECK-NEXT: [[LV_I9:%.*]] = load i64, ptr [[GEP_IDX_I8]], align 4 +; CHECK-NEXT: [[LV_I9:%.*]] = load i64, ptr [[GEP_IDX_I8]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = add i64 [[LV_I]], [[SUM]] ; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[ADD]], [[LV_I9]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[C:%.*]] = icmp slt i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[SMAX]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[AT_WITH_INT_CONVERSION_EXIT11_PEEL:%.*]] ], [ [[SUM_NEXT]], [[AT_WITH_INT_CONVERSION_EXIT11]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] @@ -127,8 +128,9 @@ ; CHECK-NEXT: [[END_INT_I:%.*]] = ptrtoint ptr [[END_I]] to i64 ; CHECK-NEXT: [[SUB_I:%.*]] = sub i64 [[END_INT_I]], [[START_INT_I]] ; CHECK-NEXT: [[GEP_END_I13:%.*]] = getelementptr [[VEC]], ptr [[C:%.*]], i64 0, i32 1 +; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 0) ; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr [[VEC]], ptr [[B:%.*]], i64 0, i32 1 -; CHECK-NEXT: [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 4 +; CHECK-NEXT: [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 8 ; CHECK-NEXT: [[START_I1_PEEL:%.*]] = load ptr, ptr [[B]], align 8 ; CHECK-NEXT: [[END_I3_PEEL:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8 ; CHECK-NEXT: [[START_INT_I4_PEEL:%.*]] = ptrtoint ptr [[START_I1_PEEL]] to i64 @@ -140,14 +142,14 @@ ; CHECK-NEXT: [[START_INT_I15_PEEL:%.*]] = ptrtoint ptr [[START_I12_PEEL]] to i64 ; CHECK-NEXT: [[END_INT_I16_PEEL:%.*]] = ptrtoint ptr [[END_I14_PEEL]] to i64 ; CHECK-NEXT: [[SUB_I17_PEEL:%.*]] = sub i64 [[END_INT_I16_PEEL]], [[START_INT_I15_PEEL]] -; CHECK-NEXT: [[LV_I9_PEEL:%.*]] = load i64, ptr [[START_I1_PEEL]], align 4 -; CHECK-NEXT: 
[[LV_I20_PEEL:%.*]] = load i64, ptr [[START_I12_PEEL]], align 4 +; CHECK-NEXT: [[LV_I9_PEEL:%.*]] = load i64, ptr [[START_I1_PEEL]], align 8 +; CHECK-NEXT: [[LV_I20_PEEL:%.*]] = load i64, ptr [[START_I12_PEEL]], align 8 ; CHECK-NEXT: [[ADD_2_PEEL:%.*]] = add i64 [[LV_I_PEEL]], [[LV_I9_PEEL]] ; CHECK-NEXT: [[SUM_NEXT_PEEL:%.*]] = add i64 [[ADD_2_PEEL]], [[LV_I20_PEEL]] -; CHECK-NEXT: [[COND_PEEL:%.*]] = icmp sgt i64 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[COND_PEEL]], label [[LOOP_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK-NEXT: [[EXITCOND_PEEL_NOT:%.*]] = icmp slt i64 [[N]], 1 +; CHECK-NEXT: br i1 [[EXITCOND_PEEL_NOT]], label [[EXIT:%.*]], label [[LOOP_PREHEADER:%.*]] ; CHECK: loop.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 ; CHECK-NEXT: [[UMIN:%.*]] = tail call i64 @llvm.umin.i64(i64 [[SUB_I17_PEEL]], i64 [[TMP0]]) ; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[UMIN]] ; CHECK-NEXT: [[UMIN26:%.*]] = tail call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SUB_I6_PEEL]]) @@ -169,17 +171,17 @@ ; CHECK-NEXT: [[VEC_PHI28:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x i64>, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x i64>, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <2 x i64>, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[START_I12_PEEL]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <2 x i64>, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <2 x i64>, ptr [[TMP10]], align 8 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[TMP10]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <2 x i64>, ptr [[TMP11]], align 4 +; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <2 x i64>, ptr [[TMP11]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i64> [[WIDE_LOAD29]], [[VEC_PHI28]] ; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[TMP12]], [[WIDE_LOAD30]] @@ -207,7 +209,7 @@ ; CHECK-NEXT: unreachable ; CHECK: at_with_int_conversion.exit: ; CHECK-NEXT: [[GEP_IDX_I:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[IV]] -; CHECK-NEXT: [[LV_I:%.*]] = load i64, ptr [[GEP_IDX_I]], align 4 +; CHECK-NEXT: [[LV_I:%.*]] = load i64, ptr [[GEP_IDX_I]], align 8 ; CHECK-NEXT: [[INRANGE_I7:%.*]] = icmp ult i64 [[SUB_I6_PEEL]], [[IV]] ; CHECK-NEXT: br i1 [[INRANGE_I7]], label [[ERROR_I10:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT11:%.*]] ; CHECK: error.i10: @@ -221,15 +223,15 @@ ; CHECK-NEXT: unreachable ; CHECK: at_with_int_conversion.exit22: ; CHECK-NEXT: [[GEP_IDX_I8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[IV]] -; CHECK-NEXT: [[LV_I9:%.*]] = load i64, ptr 
[[GEP_IDX_I8]], align 4 +; CHECK-NEXT: [[LV_I9:%.*]] = load i64, ptr [[GEP_IDX_I8]], align 8 ; CHECK-NEXT: [[GEP_IDX_I19:%.*]] = getelementptr i64, ptr [[START_I12_PEEL]], i64 [[IV]] -; CHECK-NEXT: [[LV_I20:%.*]] = load i64, ptr [[GEP_IDX_I19]], align 4 +; CHECK-NEXT: [[LV_I20:%.*]] = load i64, ptr [[GEP_IDX_I19]], align 8 ; CHECK-NEXT: [[ADD_1:%.*]] = add i64 [[LV_I]], [[SUM]] ; CHECK-NEXT: [[ADD_2:%.*]] = add i64 [[ADD_1]], [[LV_I9]] ; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[ADD_2]], [[LV_I20]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[SMAX]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[AT_WITH_INT_CONVERSION_EXIT22_PEEL:%.*]] ], [ [[SUM_NEXT]], [[AT_WITH_INT_CONVERSION_EXIT22]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] @@ -267,7 +269,7 @@ ; CHECK-NEXT: br i1 [[INRANGE]], label [[ERROR:%.*]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: [[GEP_IDX:%.*]] = getelementptr i64, ptr [[START]], i64 [[IDX]] -; CHECK-NEXT: [[LV:%.*]] = load i64, ptr [[GEP_IDX]], align 4 +; CHECK-NEXT: [[LV:%.*]] = load i64, ptr [[GEP_IDX]], align 8 ; CHECK-NEXT: ret i64 [[LV]] ; CHECK: error: ; CHECK-NEXT: tail call void @error() diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll @@ -183,9 +183,9 @@ define i64 @red_ld_2xi64(ptr %ptr) { ; CHECK-LABEL: @red_ld_2xi64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LD0:%.*]] = load i64, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: [[LD0:%.*]] = load i64, ptr [[PTR:%.*]], align 8 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 1 -; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[GEP]], align 8 ; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i64 [[LD0]], [[LD1]] ; CHECK-NEXT: ret i64 [[ADD_1]] ; @@ -200,7 +200,7 @@ define i64 @red_ld_4xi64(ptr %ptr) { ; CHECK-LABEL: @red_ld_4xi64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[PTR:%.*]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP0]]) ; CHECK-NEXT: ret i64 [[TMP1]] ; @@ -221,7 +221,7 @@ define i64 @red_ld_8xi64(ptr %ptr) { ; CHECK-LABEL: @red_ld_8xi64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP0]]) ; CHECK-NEXT: ret i64 [[TMP1]] ; @@ -254,7 +254,7 @@ define i64 @red_ld_16xi64(ptr %ptr) { ; CHECK-LABEL: @red_ld_16xi64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i64>, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i64>, ptr [[PTR:%.*]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP0]]) ; CHECK-NEXT: ret i64 [[TMP1]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll 
b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll @@ -101,7 +101,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = or <2 x i64> [[TMP0]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP3]], [[TMP6]] -; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[ARRAY1]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[ARRAY1]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll b/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll @@ -8,10 +8,10 @@ ; Base case with no interesting control dependencies define void @test_no_control(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test_no_control( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -32,13 +32,13 @@ define void @test1(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test1( -; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -58,13 +58,13 @@ define void @test2(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %c1 = load i64, ptr %c @@ -85,13 +85,13 @@ define void @test3(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test3( -; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 ; 
CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -111,13 +111,13 @@ define void @test4(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test4( -; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -138,12 +138,12 @@ define void @test5(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test5( ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %a2 = getelementptr i64, ptr %a, i32 1 @@ -164,10 +164,10 @@ define void @test6(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -196,15 +196,15 @@ define void @test7(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 1 -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A]], align 4 -; CHECK-NEXT: store i64 0, ptr [[A]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A]], align 8 +; CHECK-NEXT: store i64 0, ptr [[A]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x 
i64> [[TMP4]], [[TMP2]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -229,15 +229,15 @@ define void @test8(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test8( ; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 1 -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A]], align 4 -; CHECK-NEXT: store i64 0, ptr [[A]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A]], align 8 +; CHECK-NEXT: store i64 0, ptr [[A]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_throw() #[[ATTR4:[0-9]+]] -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -262,15 +262,15 @@ define void @test9(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test9( ; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 1 -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A]], align 4 -; CHECK-NEXT: store i64 0, ptr [[A]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A]], align 8 +; CHECK-NEXT: store i64 0, ptr [[A]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_throw() -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -294,18 +294,18 @@ ; A variant of test7 which shows the same problem with a non-load instruction define void @test10(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test10( -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[A]], i32 1 -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 8 ; CHECK-NEXT: [[U1:%.*]] = udiv i64 200, [[V1]] -; CHECK-NEXT: store i64 [[U1]], ptr [[A]], align 4 +; CHECK-NEXT: store i64 [[U1]], ptr [[A]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro() ; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[V2]] -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[U2]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr 
[[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -334,14 +334,14 @@ define void @test11(i64 %x, i64 %y, ptr %b, ptr %c) { ; CHECK-LABEL: @test11( ; CHECK-NEXT: [[U1:%.*]] = udiv i64 200, [[X:%.*]] -; CHECK-NEXT: store i64 [[U1]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store i64 [[U1]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro() ; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[Y:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[U2]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B]], align 8 ; CHECK-NEXT: ret void ; %u1 = udiv i64 200, %x diff --git a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll @@ -52,14 +52,17 @@ define void @test2(i64* %a, i64* %b) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A:%.*]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[B:%.*]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP4]], [[TMP6]] -; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP5]], align 8 +; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 1 +; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 2 +; CHECK-NEXT: [[I1:%.*]] = ptrtoint ptr [[A1]] to i64 +; CHECK-NEXT: [[B3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 3 +; CHECK-NEXT: [[I2:%.*]] = ptrtoint ptr [[B3]] to i64 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A1]], align 8 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 8 +; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[I1]], [[V1]] +; CHECK-NEXT: [[ADD2:%.*]] = add i64 [[I2]], [[V2]] +; CHECK-NEXT: store i64 [[ADD1]], ptr [[A1]], align 8 +; CHECK-NEXT: store i64 [[ADD2]], ptr [[A2]], align 8 ; CHECK-NEXT: ret void ; %a1 = getelementptr inbounds i64, i64* %a, i64 1 diff --git a/llvm/test/Transforms/SafeStack/X86/setjmp2.ll b/llvm/test/Transforms/SafeStack/X86/setjmp2.ll --- a/llvm/test/Transforms/SafeStack/X86/setjmp2.ll +++ b/llvm/test/Transforms/SafeStack/X86/setjmp2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt -safe-stack -S -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck %s -; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s +; RUN: opt -safe-stack -S -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck %s --check-prefix=I386 +; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s --check-prefix=X86-64 %struct.__jmp_buf_tag = type { [8 x i64], i32, %struct.__sigset_t } %struct.__sigset_t = type { [16 x i64] } @@ -11,27 +11,48 @@ ; setjmp/longjmp test with dynamically sized array. ; Requires protector. 
define i32 @foo(i32 %size) nounwind uwtable safestack { -; CHECK-LABEL: define i32 @foo( -; CHECK-SAME: i32 [[SIZE:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[UNSAFE_STACK_PTR:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 8 -; CHECK-NEXT: [[UNSAFE_STACK_DYNAMIC_PTR:%.*]] = alloca ptr, align 8 -; CHECK-NEXT: store ptr [[UNSAFE_STACK_PTR]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[SIZE]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -16 -; CHECK-NEXT: [[A:%.*]] = inttoptr i64 [[TMP5]] to ptr -; CHECK-NEXT: store ptr [[A]], ptr @__safestack_unsafe_stack_ptr, align 8 -; CHECK-NEXT: store ptr [[A]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8 -; CHECK-NEXT: [[CALL:%.*]] = call i32 @_setjmp(ptr @buf) #[[ATTR1:[0-9]+]] -; CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8 -; CHECK-NEXT: store ptr [[TMP6]], ptr @__safestack_unsafe_stack_ptr, align 8 -; CHECK-NEXT: call void @funcall(ptr [[A]]) -; CHECK-NEXT: store ptr [[UNSAFE_STACK_PTR]], ptr @__safestack_unsafe_stack_ptr, align 8 -; CHECK-NEXT: ret i32 0 +; I386-LABEL: define i32 @foo( +; I386-SAME: i32 [[SIZE:%.*]]) #[[ATTR0:[0-9]+]] { +; I386-NEXT: entry: +; I386-NEXT: [[UNSAFE_STACK_PTR:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 4 +; I386-NEXT: [[UNSAFE_STACK_DYNAMIC_PTR:%.*]] = alloca ptr, align 4 +; I386-NEXT: store ptr [[UNSAFE_STACK_PTR]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 4 +; I386-NEXT: [[TMP0:%.*]] = mul i32 [[SIZE]], 4 +; I386-NEXT: [[TMP1:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 4 +; I386-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i32 +; I386-NEXT: [[TMP3:%.*]] = sub i32 [[TMP2]], [[TMP0]] +; I386-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], -16 +; I386-NEXT: [[A:%.*]] = inttoptr i32 [[TMP4]] to ptr +; I386-NEXT: store ptr [[A]], ptr @__safestack_unsafe_stack_ptr, align 4 +; I386-NEXT: store ptr [[A]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 4 +; I386-NEXT: [[CALL:%.*]] = call i32 @_setjmp(ptr @buf) #[[ATTR1:[0-9]+]] +; I386-NEXT: [[TMP5:%.*]] = load ptr, ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 4 +; I386-NEXT: store ptr [[TMP5]], ptr @__safestack_unsafe_stack_ptr, align 4 +; I386-NEXT: call void @funcall(ptr [[A]]) +; I386-NEXT: store ptr [[UNSAFE_STACK_PTR]], ptr @__safestack_unsafe_stack_ptr, align 4 +; I386-NEXT: ret i32 0 +; +; X86-64-LABEL: define i32 @foo( +; X86-64-SAME: i32 [[SIZE:%.*]]) #[[ATTR0:[0-9]+]] { +; X86-64-NEXT: entry: +; X86-64-NEXT: [[UNSAFE_STACK_PTR:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 8 +; X86-64-NEXT: [[UNSAFE_STACK_DYNAMIC_PTR:%.*]] = alloca ptr, align 8 +; X86-64-NEXT: store ptr [[UNSAFE_STACK_PTR]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8 +; X86-64-NEXT: [[TMP0:%.*]] = zext i32 [[SIZE]] to i64 +; X86-64-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; X86-64-NEXT: [[TMP2:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 8 +; X86-64-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[TMP2]] to i64 +; X86-64-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], [[TMP1]] +; X86-64-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -16 +; X86-64-NEXT: [[A:%.*]] = inttoptr i64 [[TMP5]] to ptr +; X86-64-NEXT: store ptr [[A]], ptr @__safestack_unsafe_stack_ptr, align 8 +; X86-64-NEXT: store ptr [[A]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], 
align 8 +; X86-64-NEXT: [[CALL:%.*]] = call i32 @_setjmp(ptr @buf) #[[ATTR1:[0-9]+]] +; X86-64-NEXT: [[TMP6:%.*]] = load ptr, ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8 +; X86-64-NEXT: store ptr [[TMP6]], ptr @__safestack_unsafe_stack_ptr, align 8 +; X86-64-NEXT: call void @funcall(ptr [[A]]) +; X86-64-NEXT: store ptr [[UNSAFE_STACK_PTR]], ptr @__safestack_unsafe_stack_ptr, align 8 +; X86-64-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn.ll --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn.ll @@ -22,20 +22,20 @@ ; IR-NEXT: .preheader: ; IR-NEXT: [[I:%.*]] = sext i32 [[Y]] to i64 ; IR-NEXT: [[I1:%.*]] = sext i32 [[X]] to i64 -; IR-NEXT: [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[I1]], i64 [[I]] +; IR-NEXT: [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i32 0, i32 [[X]], i32 [[Y]] ; IR-NEXT: [[I3:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr ; IR-NEXT: [[I4:%.*]] = load float, ptr [[I3]], align 4 ; IR-NEXT: [[I5:%.*]] = fadd float [[I4]], 0.000000e+00 -; IR-NEXT: [[I82:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 1 -; IR-NEXT: [[I9:%.*]] = addrspacecast ptr addrspace(3) [[I82]] to ptr +; IR-NEXT: [[I87:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 1 +; IR-NEXT: [[I9:%.*]] = addrspacecast ptr addrspace(3) [[I87]] to ptr ; IR-NEXT: [[I10:%.*]] = load float, ptr [[I9]], align 4 ; IR-NEXT: [[I11:%.*]] = fadd float [[I5]], [[I10]] -; IR-NEXT: [[I144:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 32 -; IR-NEXT: [[I15:%.*]] = addrspacecast ptr addrspace(3) [[I144]] to ptr +; IR-NEXT: [[I1412:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 32 +; IR-NEXT: [[I15:%.*]] = addrspacecast ptr addrspace(3) [[I1412]] to ptr ; IR-NEXT: [[I16:%.*]] = load float, ptr [[I15]], align 4 ; IR-NEXT: [[I17:%.*]] = fadd float [[I11]], [[I16]] -; IR-NEXT: [[I187:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 33 -; IR-NEXT: [[I19:%.*]] = addrspacecast ptr addrspace(3) [[I187]] to ptr +; IR-NEXT: [[I1818:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 33 +; IR-NEXT: [[I19:%.*]] = addrspacecast ptr addrspace(3) [[I1818]] to ptr ; IR-NEXT: [[I20:%.*]] = load float, ptr [[I19]], align 4 ; IR-NEXT: [[I21:%.*]] = fadd float [[I17]], [[I20]] ; IR-NEXT: store float [[I21]], ptr [[OUTPUT]], align 4 @@ -84,20 +84,20 @@ ; IR-NEXT: .preheader: ; IR-NEXT: [[I:%.*]] = sext i32 [[Y]] to i64 ; IR-NEXT: [[I1:%.*]] = sext i32 [[X]] to i64 -; IR-NEXT: [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[I1]], i64 [[I]] +; IR-NEXT: [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i32 0, i32 [[X]], i32 [[Y]] ; IR-NEXT: [[I3:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr ; IR-NEXT: [[I4:%.*]] = load float, ptr [[I3]], align 4 ; IR-NEXT: [[I5:%.*]] = fadd float [[I4]], 0.000000e+00 -; IR-NEXT: [[I72:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 1 -; IR-NEXT: [[I8:%.*]] = addrspacecast ptr addrspace(3) [[I72]] to ptr +; IR-NEXT: [[I77:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 1 +; IR-NEXT: [[I8:%.*]] = addrspacecast ptr addrspace(3) [[I77]] to ptr ; IR-NEXT: [[I9:%.*]] = load float, ptr [[I8]], 
align 4 ; IR-NEXT: [[I10:%.*]] = fadd float [[I5]], [[I9]] -; IR-NEXT: [[I124:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 32 -; IR-NEXT: [[I13:%.*]] = addrspacecast ptr addrspace(3) [[I124]] to ptr +; IR-NEXT: [[I1212:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 32 +; IR-NEXT: [[I13:%.*]] = addrspacecast ptr addrspace(3) [[I1212]] to ptr ; IR-NEXT: [[I14:%.*]] = load float, ptr [[I13]], align 4 ; IR-NEXT: [[I15:%.*]] = fadd float [[I10]], [[I14]] -; IR-NEXT: [[I167:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 33 -; IR-NEXT: [[I17:%.*]] = addrspacecast ptr addrspace(3) [[I167]] to ptr +; IR-NEXT: [[I1618:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 33 +; IR-NEXT: [[I17:%.*]] = addrspacecast ptr addrspace(3) [[I1618]] to ptr ; IR-NEXT: [[I18:%.*]] = load float, ptr [[I17]], align 4 ; IR-NEXT: [[I19:%.*]] = fadd float [[I15]], [[I18]] ; IR-NEXT: store float [[I19]], ptr [[OUTPUT]], align 4 @@ -145,20 +145,20 @@ ; IR-NEXT: .preheader: ; IR-NEXT: [[I:%.*]] = zext i32 [[Y]] to i64 ; IR-NEXT: [[I1:%.*]] = zext i32 [[X]] to i64 -; IR-NEXT: [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[I1]], i64 [[I]] +; IR-NEXT: [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i32 0, i32 [[X]], i32 [[Y]] ; IR-NEXT: [[I3:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr ; IR-NEXT: [[I4:%.*]] = load float, ptr [[I3]], align 4 ; IR-NEXT: [[I5:%.*]] = fadd float [[I4]], 0.000000e+00 -; IR-NEXT: [[I82:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 1 -; IR-NEXT: [[I9:%.*]] = addrspacecast ptr addrspace(3) [[I82]] to ptr +; IR-NEXT: [[I87:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 1 +; IR-NEXT: [[I9:%.*]] = addrspacecast ptr addrspace(3) [[I87]] to ptr ; IR-NEXT: [[I10:%.*]] = load float, ptr [[I9]], align 4 ; IR-NEXT: [[I11:%.*]] = fadd float [[I5]], [[I10]] -; IR-NEXT: [[I144:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 32 -; IR-NEXT: [[I15:%.*]] = addrspacecast ptr addrspace(3) [[I144]] to ptr +; IR-NEXT: [[I1412:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 32 +; IR-NEXT: [[I15:%.*]] = addrspacecast ptr addrspace(3) [[I1412]] to ptr ; IR-NEXT: [[I16:%.*]] = load float, ptr [[I15]], align 4 ; IR-NEXT: [[I17:%.*]] = fadd float [[I11]], [[I16]] -; IR-NEXT: [[I187:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 33 -; IR-NEXT: [[I19:%.*]] = addrspacecast ptr addrspace(3) [[I187]] to ptr +; IR-NEXT: [[I1818:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 33 +; IR-NEXT: [[I19:%.*]] = addrspacecast ptr addrspace(3) [[I1818]] to ptr ; IR-NEXT: [[I20:%.*]] = load float, ptr [[I19]], align 4 ; IR-NEXT: [[I21:%.*]] = fadd float [[I17]], [[I20]] ; IR-NEXT: store float [[I21]], ptr [[OUTPUT]], align 4 @@ -205,20 +205,20 @@ ; IR-NEXT: .preheader: ; IR-NEXT: [[I:%.*]] = zext i32 [[Y]] to i64 ; IR-NEXT: [[I1:%.*]] = zext i32 [[X]] to i64 -; IR-NEXT: [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[I1]], i64 [[I]] +; IR-NEXT: [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i32 0, i32 [[X]], i32 [[Y]] ; IR-NEXT: [[I3:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr ; IR-NEXT: [[I4:%.*]] = load float, ptr [[I3]], align 4 ; IR-NEXT: [[I5:%.*]] = fadd float [[I4]], 0.000000e+00 -; IR-NEXT: [[I72:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 1 -; IR-NEXT: [[I8:%.*]] = 
addrspacecast ptr addrspace(3) [[I72]] to ptr +; IR-NEXT: [[I77:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 1 +; IR-NEXT: [[I8:%.*]] = addrspacecast ptr addrspace(3) [[I77]] to ptr ; IR-NEXT: [[I9:%.*]] = load float, ptr [[I8]], align 4 ; IR-NEXT: [[I10:%.*]] = fadd float [[I5]], [[I9]] -; IR-NEXT: [[I124:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 32 -; IR-NEXT: [[I13:%.*]] = addrspacecast ptr addrspace(3) [[I124]] to ptr +; IR-NEXT: [[I1212:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 32 +; IR-NEXT: [[I13:%.*]] = addrspacecast ptr addrspace(3) [[I1212]] to ptr ; IR-NEXT: [[I14:%.*]] = load float, ptr [[I13]], align 4 ; IR-NEXT: [[I15:%.*]] = fadd float [[I10]], [[I14]] -; IR-NEXT: [[I167:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 33 -; IR-NEXT: [[I17:%.*]] = addrspacecast ptr addrspace(3) [[I167]] to ptr +; IR-NEXT: [[I1618:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 33 +; IR-NEXT: [[I17:%.*]] = addrspacecast ptr addrspace(3) [[I1618]] to ptr ; IR-NEXT: [[I18:%.*]] = load float, ptr [[I17]], align 4 ; IR-NEXT: [[I19:%.*]] = fadd float [[I15]], [[I18]] ; IR-NEXT: store float [[I19]], ptr [[OUTPUT]], align 4 diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep.ll --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep.ll @@ -12,10 +12,9 @@ define ptr addrspace(3) @packed_struct(i32 %i, i32 %j) { ; CHECK-LABEL: @packed_struct( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[I:%.*]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[J:%.*]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [1024 x %struct.Packed], ptr addrspace(3) @packed_struct_array, i64 0, i64 [[TMP0]], i32 1, i64 [[TMP1]] -; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 100 +; CHECK-NEXT: [[IDXPROM:%.*]] = trunc i64 0 to i32 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [1024 x %struct.Packed], ptr addrspace(3) @packed_struct_array, i32 [[IDXPROM]], i32 [[I:%.*]], i32 1, i32 [[J:%.*]] +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP0]], i32 100 ; CHECK-NEXT: ret ptr addrspace(3) [[UGLYGEP]] ; entry: diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll @@ -163,7 +163,7 @@ ; CHECK-NEXT: [[I2:%.*]] = add i64 [[B]], [[A]] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 [[I2]], i64 0 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i64 160 -; CHECK-NEXT: store i64 [[B5]], ptr [[OUT]], align 4 +; CHECK-NEXT: store i64 [[B5]], ptr [[OUT]], align 8 ; CHECK-NEXT: ret ptr [[P3]] ; entry: @@ -358,8 +358,8 @@ ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1 -; CHECK-NEXT: [[PTR22:%.*]] = getelementptr inbounds [[STRUCT2:%.*]], ptr [[TMP0]], i64 -3 -; CHECK-NEXT: ret ptr [[PTR22]] +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 -64 +; CHECK-NEXT: ret ptr [[UGLYGEP]] ; entry: %arrayidx = add nsw 
i64 %idx, -2 @@ -373,7 +373,7 @@ ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1 -; CHECK-NEXT: [[PTR21:%.*]] = getelementptr inbounds [[STRUCT2:%.*]], ptr [[TMP0]], i64 151 +; CHECK-NEXT: [[PTR21:%.*]] = getelementptr inbounds [[STRUCT2:%.*]], ptr [[TMP0]], i64 134 ; CHECK-NEXT: ret ptr [[PTR21]] ; entry: @@ -390,7 +390,7 @@ ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1 -; CHECK-NEXT: [[PTR21:%.*]] = getelementptr inbounds [[STRUCT2:%.*]], ptr [[TMP0]], i64 151 +; CHECK-NEXT: [[PTR21:%.*]] = getelementptr inbounds [[STRUCT2:%.*]], ptr [[TMP0]], i64 134 ; CHECK-NEXT: ret ptr [[PTR21]] ; entry: diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/RISCV/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/RISCV/split-gep.ll --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/RISCV/split-gep.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/RISCV/split-gep.ll @@ -13,11 +13,11 @@ ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[I:%.*]], 5 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[ARRAY:%.*]], i64 [[I]] ; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 5 -; CHECK-NEXT: store i64 [[J:%.*]], ptr [[GEP4]], align 4 +; CHECK-NEXT: store i64 [[J:%.*]], ptr [[GEP4]], align 8 ; CHECK-NEXT: [[GEP26:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 6 -; CHECK-NEXT: store i64 [[J]], ptr [[GEP26]], align 4 +; CHECK-NEXT: store i64 [[J]], ptr [[GEP26]], align 8 ; CHECK-NEXT: [[GEP38:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 35 -; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP38]], align 4 +; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP38]], align 8 ; CHECK-NEXT: ret i64 undef ; entry: @@ -169,11 +169,11 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[I:%.*]], 5 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[ARRAY:%.*]], i64 [[J:%.*]] -; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP]], align 4 +; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[ARRAY]], i64 [[I]] ; CHECK-NEXT: [[GEP52:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 6 -; CHECK-NEXT: store i64 [[I]], ptr [[GEP52]], align 4 -; CHECK-NEXT: store i64 [[I]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i64 [[I]], ptr [[GEP52]], align 8 +; CHECK-NEXT: store i64 [[I]], ptr [[TMP0]], align 8 ; CHECK-NEXT: ret i64 undef ; entry: diff --git a/llvm/test/Transforms/TypePromotion/ARM/pointers.ll b/llvm/test/Transforms/TypePromotion/ARM/pointers.ll --- a/llvm/test/Transforms/TypePromotion/ARM/pointers.ll +++ b/llvm/test/Transforms/TypePromotion/ARM/pointers.ll @@ -130,7 +130,7 @@ define i16 @pointer_to_pointer(ptr %arg, i16 zeroext %limit) { ; CHECK-LABEL: @pointer_to_pointer( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ADDR:%.*]] = load ptr, ptr [[ARG:%.*]], align 8 +; CHECK-NEXT: [[ADDR:%.*]] = load ptr, ptr [[ARG:%.*]], align 4 ; CHECK-NEXT: [[VAL:%.*]] = load i16, ptr [[ADDR]], align 2 ; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[VAL]] to i32 ; CHECK-NEXT: [[ADD:%.*]] = add nuw i32 [[TMP0]], 7 diff --git a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant-inseltpoison.ll --- a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant-inseltpoison.ll +++ 
b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant-inseltpoison.ll @@ -26,7 +26,7 @@ define <2 x i64> @add_constant_load(ptr %p) { ; CHECK-LABEL: @add_constant_load( -; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8 ; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[LD]], i32 0 ; CHECK-NEXT: [[BO:%.*]] = add <2 x i64> [[INS]], ; CHECK-NEXT: ret <2 x i64> [[BO]] @@ -152,7 +152,7 @@ define <2 x i64> @shl_constant_op0_load(ptr %p) { ; CHECK-LABEL: @shl_constant_op0_load( -; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8 ; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[LD]], i32 1 ; CHECK-NEXT: [[BO:%.*]] = shl <2 x i64> , [[INS]] ; CHECK-NEXT: ret <2 x i64> [[BO]] @@ -203,7 +203,7 @@ define <2 x i64> @shl_constant_op1_load(ptr %p) { ; CHECK-LABEL: @shl_constant_op1_load( -; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8 ; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[LD]], i32 0 ; CHECK-NEXT: [[BO:%.*]] = shl nuw <2 x i64> [[INS]], ; CHECK-NEXT: ret <2 x i64> [[BO]] diff --git a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll --- a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll +++ b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll @@ -26,7 +26,7 @@ define <2 x i64> @add_constant_load(ptr %p) { ; CHECK-LABEL: @add_constant_load( -; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8 ; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[LD]], i32 0 ; CHECK-NEXT: [[BO:%.*]] = add <2 x i64> [[INS]], ; CHECK-NEXT: ret <2 x i64> [[BO]] @@ -152,7 +152,7 @@ define <2 x i64> @shl_constant_op0_load(ptr %p) { ; CHECK-LABEL: @shl_constant_op0_load( -; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8 ; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[LD]], i32 1 ; CHECK-NEXT: [[BO:%.*]] = shl <2 x i64> , [[INS]] ; CHECK-NEXT: ret <2 x i64> [[BO]] @@ -203,7 +203,7 @@ define <2 x i64> @shl_constant_op1_load(ptr %p) { ; CHECK-LABEL: @shl_constant_op1_load( -; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8 ; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[LD]], i32 0 ; CHECK-NEXT: [[BO:%.*]] = shl nuw <2 x i64> [[INS]], ; CHECK-NEXT: ret <2 x i64> [[BO]] diff --git a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll --- a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll @@ -169,7 +169,7 @@ define <2 x i1> @constant_op1_i64_load(ptr %p) { ; CHECK-LABEL: @constant_op1_i64_load( -; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8 ; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[LD]], i32 0 ; CHECK-NEXT: [[R:%.*]] = icmp eq <2 x i64> [[INS]], ; CHECK-NEXT: ret <2 x i1> [[R]] diff --git a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll 
b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll
--- a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll
@@ -169,7 +169,7 @@
 define <2 x i1> @constant_op1_i64_load(ptr %p) {
 ; CHECK-LABEL: @constant_op1_i64_load(
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
 ; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[LD]], i32 0
 ; CHECK-NEXT:    [[R:%.*]] = icmp eq <2 x i64> [[INS]],
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -504,10 +504,31 @@
   std::unique_ptr<ToolOutputFile> RemarksFile = std::move(*RemarksFileOrErr);

   // Load the input module...
-  auto SetDataLayout = [](StringRef, StringRef) -> std::optional<std::string> {
-    if (ClDataLayout.empty())
+  auto SetDataLayout = [&](StringRef IRTriple,
+                           StringRef IRLayout) -> std::optional<std::string> {
+    // Data layout specified on the command line has the highest priority.
+    if (!ClDataLayout.empty())
+      return ClDataLayout;
+    // If an explicit data layout is already defined in the IR, don't infer.
+    if (!IRLayout.empty())
       return std::nullopt;
-    return ClDataLayout;
+
+    // If an explicit triple was specified (either in the IR or on the
+    // command line), use that to infer the default data layout. However, the
+    // command line target triple should override the IR file target triple.
+    std::string TripleStr =
+        TargetTriple.empty() ? IRTriple.str() : Triple::normalize(TargetTriple);
+    // If the triple string is still empty, we don't fall back to
+    // sys::getDefaultTargetTriple() since we do not want differing behaviour
+    // depending on the configured default triple. Therefore, if the user did
+    // not pass -mtriple or define an explicit triple/datalayout in the IR, we
+    // default to an empty (default) DataLayout.
+    if (TripleStr.empty())
+      return std::nullopt;
+    // Otherwise we infer the DataLayout from the target machine.
+    std::unique_ptr<TargetMachine> TM =
+        ExitOnErr(codegen::createTargetMachineForTriple(TripleStr));
+    return TM->createDataLayout().getStringRepresentation();
   };
   std::unique_ptr<Module> M;
   if (NoUpgradeDebugInfo)
@@ -535,7 +556,7 @@
     }
   }

-  // If we are supposed to override the target triple or data layout, do so now.
+  // If we are supposed to override the target triple, do so now.
   if (!TargetTriple.empty())
     M->setTargetTriple(Triple::normalize(TargetTriple));
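
Note on the behaviour driving the test updates above: with the new SetDataLayout callback, opt infers a default data layout from the target triple (taken from -mtriple, or from the IR file) whenever the module carries no explicit "target datalayout". On 64-bit triples the inferred layout gives i64 an ABI alignment of 8 rather than the 4 implied by the empty default layout, which is why the autogenerated checks flip from align 4 to align 8; on i386 the i64 ABI alignment stays 4, which is why tests such as setjmp2.ll now need separate I386 and X86-64 prefixes. A minimal illustration follows; this is a hypothetical test for exposition, not part of this patch, and the verify pass is used only because any -S run of opt prints the materialized alignment:

; RUN: opt -S -passes=verify -mtriple=x86_64-pc-linux-gnu < %s | FileCheck %s
; No "target datalayout" line here: the layout is now inferred from -mtriple,
; so the unannotated load picks up the x86-64 ABI alignment for i64.
define i64 @load_without_explicit_align(ptr %p) {
; CHECK: load i64, ptr %p, align 8
  %v = load i64, ptr %p
  ret i64 %v
}

Running the same file with -mtriple=i386-pc-linux-gnu would instead print align 4, and omitting -mtriple entirely keeps the old empty-layout behaviour.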