diff --git a/llvm/test/Analysis/BasicAA/libfuncs.ll b/llvm/test/Analysis/BasicAA/libfuncs.ll --- a/llvm/test/Analysis/BasicAA/libfuncs.ll +++ b/llvm/test/Analysis/BasicAA/libfuncs.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=i386-pc-linux-gnu -aa-pipeline=basic-aa -passes=inferattrs,aa-eval -print-all-alias-modref-info -disable-output 2>&1 %s | FileCheck %s +; RUN: opt -mtriple=x86_64-pc-linux-gnu -aa-pipeline=basic-aa -passes=inferattrs,aa-eval -print-all-alias-modref-info -disable-output 2>&1 %s | FileCheck %s ; CHECK-LABEL: Function: test_memcmp_const_size ; CHECK: Just Ref: Ptr: i8* %a <-> %res = tail call i32 @memcmp(ptr %a, ptr %b, i64 4) diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll --- a/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll @@ -1,19 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 3 ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve -S -o - < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" define void @sve_fptruncs() { - ;CHECK-LABEL: 'sve_fptruncs' - ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f16_from_f32 = fptrunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4_f16_from_f32 = fptrunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8_f16_from_f32 = fptrunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f16_from_f64 = fptrunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4_f16_from_f64 = fptrunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8_f16_from_f64 = fptrunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f32_from_f64 = fptrunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4_f32_from_f64 = fptrunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8_f32_from_f64 = fptrunc undef to +; CHECK-LABEL: 'sve_fptruncs' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f16_from_f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4_f16_from_f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8_f16_from_f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f16_from_f64 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4_f16_from_f64 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8_f16_from_f64 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f32_from_f64 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4_f32_from_f64 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8_f32_from_f64 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; %nxv2_f16_from_f32 = fptrunc undef to %nxv4_f16_from_f32 = fptrunc undef to %nxv8_f16_from_f32 = fptrunc undef to diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll b/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll --- a/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll @@ -1,24 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 3 ; RUN: opt -mtriple=aarch64-linux-gnu -mattr=+sve -passes="print" 2>&1 -disable-output < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define void @sve_truncs() { - ;CHECK-LABEL: 'sve_truncs' - ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i16_to_i1 = trunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i32_to_i1 = trunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i64_to_i1 = trunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v4i16_to_i1 = trunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v4i32_to_i1 = trunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %trunc_v4i64_to_i1 = trunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v8i16_to_i1 = trunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %trunc_v8i32_to_i1 = trunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %trunc_v8i64_to_i1 = trunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i32_to_i16 = trunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i64_to_i32 = trunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v4i32_to_i16 = trunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %trunc_v4i64_to_i32 = trunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %trunc_v8i32_to_i16 = trunc undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %trunc_v8i64_to_i32 = trunc undef to +; CHECK-LABEL: 'sve_truncs' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i16_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i32_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i64_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v4i16_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v4i32_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %trunc_v4i64_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v8i16_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %trunc_v8i32_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %trunc_v8i64_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i32_to_i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i64_to_i32 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_v4i32_to_i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %trunc_v4i64_to_i32 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %trunc_v8i32_to_i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %trunc_v8i64_to_i32 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; %trunc_v2i16_to_i1 = trunc undef to %trunc_v2i32_to_i1 = trunc undef to %trunc_v2i64_to_i1 = trunc undef to diff --git a/llvm/test/Analysis/CostModel/AMDGPU/cast.ll b/llvm/test/Analysis/CostModel/AMDGPU/cast.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/cast.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/cast.ll @@ -74,11 +74,11 @@ ; FAST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %E = trunc <4 x i64> undef to <4 x i32> ; FAST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F = trunc <8 x i32> undef to <8 x i16> ; FAST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F1 = trunc <16 x i16> undef to <16 x i8> -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8> -; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8> +; FAST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8> +; FAST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8> ; FAST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %G = trunc <8 x i64> undef to <8 x i32> ; FAST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16> -; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8> +; FAST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8> ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef ; ; SLOW-LABEL: 'zext_sext' @@ -134,11 +134,11 @@ ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %E = trunc <4 x i64> undef to <4 x i32> ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F = trunc <8 x i32> undef to <8 x i16> ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F1 = trunc <16 x i16> undef to <16 x i8> -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8> -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8> +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8> +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8> ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %G = trunc <8 x i64> undef to <8 x i32> ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16> -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8> +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8> ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SLOW-SIZE-LABEL: 'zext_sext' diff --git a/llvm/test/Analysis/CostModel/AMDGPU/load-to-trunc.ll b/llvm/test/Analysis/CostModel/AMDGPU/load-to-trunc.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/load-to-trunc.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/load-to-trunc.ll @@ -8,7 +8,7 @@ ; Check that cost is 1 for unusual load to register sized load. define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) { ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc ; @@ -19,7 +19,7 @@ define i128 @loadUnusualInteger(ptr %ptr) { ; CHECK-LABEL: 'loadUnusualInteger' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %out = load i128, ptr %ptr, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %out = load i128, ptr %ptr, align 8 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out ; %out = load i128, ptr %ptr diff --git a/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll b/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll --- a/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll +++ b/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll @@ -9,8 +9,8 @@ ; Check that cost is 1 for unusual load to register sized load. define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) { ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc = trunc i128 %out to i32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc ; %out = load i128, ptr %ptr @@ -20,7 +20,7 @@ define i128 @loadUnusualInteger(ptr %ptr) { ; CHECK-LABEL: 'loadUnusualInteger' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out ; %out = load i128, ptr %ptr diff --git a/llvm/test/Analysis/CostModel/PowerPC/load-to-trunc.ll b/llvm/test/Analysis/CostModel/PowerPC/load-to-trunc.ll --- a/llvm/test/Analysis/CostModel/PowerPC/load-to-trunc.ll +++ b/llvm/test/Analysis/CostModel/PowerPC/load-to-trunc.ll @@ -7,7 +7,7 @@ ; Check that cost is 1 for unusual load to register sized load. define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) { ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc ; @@ -18,7 +18,7 @@ define i128 @loadUnusualInteger(ptr %ptr) { ; CHECK-LABEL: 'loadUnusualInteger' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %out = load i128, ptr %ptr, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %out = load i128, ptr %ptr, align 8 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out ; %out = load i128, ptr %ptr diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll --- a/llvm/test/Analysis/CostModel/RISCV/cast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll @@ -1075,7 +1075,7 @@ ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_v64i1 = trunc <64 x i8> undef to <64 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8> @@ -1085,7 +1085,7 @@ ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_v128i1 = trunc <128 x i8> undef to <128 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8> @@ -1095,7 +1095,7 @@ ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v256i8_v256i1 = trunc <256 x i8> undef to <256 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc undef to @@ -1227,8 +1227,8 @@ ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i64_v64i32 = trunc <64 x i64> undef to <64 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_v64i1 = trunc <64 x i8> undef to <64 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8> @@ -1237,8 +1237,8 @@ ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i64_v128i32 = trunc <128 x i64> undef to <128 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_v128i1 = trunc <128 x i8> undef to <128 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8> @@ -1247,8 +1247,8 @@ ; RV64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v256i64_v256i32 = trunc <256 x i64> undef to <256 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v256i8_v256i1 = trunc <256 x i8> undef to <256 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc undef to diff --git a/llvm/test/Analysis/CostModel/RISCV/fca-load-store.ll b/llvm/test/Analysis/CostModel/RISCV/fca-load-store.ll --- a/llvm/test/Analysis/CostModel/RISCV/fca-load-store.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fca-load-store.ll @@ -4,10 +4,10 @@ define void @load(ptr %p) { ; CHECK-LABEL: 'load' -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = load [2 x i64], ptr %p, align 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = load [4 x i64], ptr %p, align 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %3 = load { i64, i64 }, ptr %p, align 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = load { i64, i32 }, ptr %p, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = load [2 x i64], ptr %p, align 8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = load [4 x i64], ptr %p, align 8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %3 = load { i64, i64 }, ptr %p, align 8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = load { i64, i32 }, ptr %p, align 8 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; load [2 x i64], ptr %p @@ -20,10 +20,10 @@ define void @store(ptr %p) { ; CHECK-LABEL: 'store' -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store [2 x i64] undef, ptr %p, align 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store [4 x i64] undef, ptr %p, align 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store { i64, i64 } undef, ptr %p, align 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store { i64, i32 } undef, ptr %p, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store [2 x i64] undef, ptr %p, align 8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store [4 x i64] undef, ptr %p, align 8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store { i64, i64 } undef, ptr %p, align 8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store { i64, i32 } undef, ptr %p, align 8 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; store [2 x i64] undef, ptr %p diff --git a/llvm/test/Analysis/CostModel/RISCV/load-to-trunc.ll b/llvm/test/Analysis/CostModel/RISCV/load-to-trunc.ll --- a/llvm/test/Analysis/CostModel/RISCV/load-to-trunc.ll +++ b/llvm/test/Analysis/CostModel/RISCV/load-to-trunc.ll @@ -8,8 +8,8 @@ ; Check that cost is 1 for unusual load to register sized load. define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) { ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc = trunc i128 %out to i32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 16 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc ; %out = load i128, ptr %ptr @@ -19,7 +19,7 @@ define i128 @loadUnusualInteger(ptr %ptr) { ; CHECK-LABEL: 'loadUnusualInteger' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %out = load i128, ptr %ptr, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %out = load i128, ptr %ptr, align 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out ; %out = load i128, ptr %ptr diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll --- a/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll @@ -44,7 +44,7 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %37 = load , ptr %p, align 32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %38 = load , ptr %p, align 64 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %39 = load , ptr %p, align 128 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %40 = load i64, ptr %p, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %40 = load i64, ptr %p, align 8 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %41 = load <1 x i64>, ptr %p, align 8 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %42 = load <2 x i64>, ptr %p, align 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %43 = load <4 x i64>, ptr %p, align 32 @@ -187,7 +187,7 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store undef, ptr %p, align 32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store undef, ptr %p, align 64 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store undef, ptr %p, align 128 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 undef, ptr %p, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 undef, ptr %p, align 8 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i64> undef, ptr %p, align 8 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr %p, align 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i64> undef, ptr %p, align 32 diff --git a/llvm/test/Analysis/CostModel/SystemZ/load-to-trunc.ll b/llvm/test/Analysis/CostModel/SystemZ/load-to-trunc.ll --- a/llvm/test/Analysis/CostModel/SystemZ/load-to-trunc.ll +++ b/llvm/test/Analysis/CostModel/SystemZ/load-to-trunc.ll @@ -8,7 +8,7 @@ ; Check that cost is 1 for unusual load to register sized load. define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) { ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc ; @@ -19,7 +19,7 @@ define i128 @loadUnusualInteger(ptr %ptr) { ; CHECK-LABEL: 'loadUnusualInteger' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out ; %out = load i128, ptr %ptr diff --git a/llvm/test/Analysis/CostModel/X86/load-to-trunc.ll b/llvm/test/Analysis/CostModel/X86/load-to-trunc.ll --- a/llvm/test/Analysis/CostModel/X86/load-to-trunc.ll +++ b/llvm/test/Analysis/CostModel/X86/load-to-trunc.ll @@ -9,7 +9,7 @@ ; Check that cost is 1 for unusual load to register sized load. define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) { ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc ; @@ -20,7 +20,7 @@ define i128 @loadUnusualInteger(ptr %ptr) { ; CHECK-LABEL: 'loadUnusualInteger' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out ; %out = load i128, ptr %ptr diff --git a/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll b/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll --- a/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll +++ b/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll @@ -542,20 +542,20 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1> @@ -569,8 +569,8 @@ ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1> ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1> ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1> -; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> -; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1> ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1> @@ -587,8 +587,8 @@ ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1> ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1> ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1> -; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> -; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> +; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> +; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> ; AVX512VL256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512VL512-LABEL: 'trunc_vXi1' @@ -596,8 +596,8 @@ ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1> ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1> ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1> -; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> -; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1> ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1> @@ -608,14 +608,14 @@ ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1> ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1> ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1> -; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> -; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> +; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1> ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1> ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1> -; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> -; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> +; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> +; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> ; AVX512VL512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SKX256-LABEL: 'trunc_vXi1' @@ -623,14 +623,14 @@ ; SKX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1> ; SKX256-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1> ; SKX256-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1> -; SKX256-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> -; SKX256-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; SKX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; SKX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; SKX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; SKX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1> ; SKX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1> ; SKX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1> -; SKX256-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> -; SKX256-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> +; SKX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> +; SKX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> ; SKX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> ; SKX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1> ; SKX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1> @@ -650,14 +650,14 @@ ; SKX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1> ; SKX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1> ; SKX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1> -; SKX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> -; SKX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; SKX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; SKX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; SKX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; SKX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1> ; SKX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1> ; SKX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1> -; SKX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> -; SKX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> +; SKX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> +; SKX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> ; SKX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> ; SKX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1> ; SKX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1> diff --git a/llvm/test/Analysis/CostModel/X86/size-cost.ll b/llvm/test/Analysis/CostModel/X86/size-cost.ll --- a/llvm/test/Analysis/CostModel/X86/size-cost.ll +++ b/llvm/test/Analysis/CostModel/X86/size-cost.ll @@ -48,7 +48,7 @@ define ptr @inttoptr_i64_p64(i64 %x) { ; CHECK-LABEL: 'inttoptr_i64_p64' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = inttoptr i64 %x to ptr +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = inttoptr i64 %x to ptr ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret ptr %r ; %r = inttoptr i64 %x to ptr @@ -57,7 +57,7 @@ define i64 @ptrtoint_p64_i64(ptr %x) { ; CHECK-LABEL: 'ptrtoint_p64_i64' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = ptrtoint ptr %x to i64 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = ptrtoint ptr %x to i64 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r ; %r = ptrtoint ptr %x to i64 diff --git a/llvm/test/Analysis/CostModel/X86/trunc-codesize.ll b/llvm/test/Analysis/CostModel/X86/trunc-codesize.ll --- a/llvm/test/Analysis/CostModel/X86/trunc-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/trunc-codesize.ll @@ -537,26 +537,26 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> @@ -572,26 +572,26 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> @@ -607,26 +607,26 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> @@ -663,292 +663,6 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef -; -; AVX512-LABEL: 'trunc_vXi1' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i16 = trunc <5 x i16> undef to <5 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i8 = trunc <5 x i8> undef to <5 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i8 = trunc <6 x i8> undef to <6 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i8 = trunc <7 x i8> undef to <7 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i8 = trunc <10 x i8> undef to <10 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i8 = trunc <12 x i8> undef to <12 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i8 = trunc <14 x i8> undef to <14 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef -; -; AVX256-LABEL: 'trunc_vXi1' -; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1 -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i16 = trunc <5 x i16> undef to <5 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i8 = trunc <5 x i8> undef to <5 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i8 = trunc <6 x i8> undef to <6 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i8 = trunc <7 x i8> undef to <7 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i8 = trunc <10 x i8> undef to <10 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i8 = trunc <12 x i8> undef to <12 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i8 = trunc <14 x i8> undef to <14 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %i64 = trunc i64 undef to i1 %V2i64 = trunc <2 x i64> undef to <2 x i1> @@ -1096,3 +810,6 @@ ret i32 undef } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX256: {{.*}} +; AVX512: {{.*}} diff --git a/llvm/test/Analysis/CostModel/X86/trunc-latency.ll b/llvm/test/Analysis/CostModel/X86/trunc-latency.ll --- a/llvm/test/Analysis/CostModel/X86/trunc-latency.ll +++ b/llvm/test/Analysis/CostModel/X86/trunc-latency.ll @@ -537,26 +537,26 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> @@ -572,26 +572,26 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> @@ -607,26 +607,26 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> @@ -663,292 +663,6 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef -; -; AVX512-LABEL: 'trunc_vXi1' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i16 = trunc <5 x i16> undef to <5 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i8 = trunc <5 x i8> undef to <5 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i8 = trunc <6 x i8> undef to <6 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i8 = trunc <7 x i8> undef to <7 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i8 = trunc <10 x i8> undef to <10 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i8 = trunc <12 x i8> undef to <12 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i8 = trunc <14 x i8> undef to <14 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef -; -; AVX256-LABEL: 'trunc_vXi1' -; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1 -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i16 = trunc <5 x i16> undef to <5 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i8 = trunc <5 x i8> undef to <5 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i8 = trunc <6 x i8> undef to <6 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i8 = trunc <7 x i8> undef to <7 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i8 = trunc <10 x i8> undef to <10 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i8 = trunc <12 x i8> undef to <12 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i8 = trunc <14 x i8> undef to <14 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %i64 = trunc i64 undef to i1 %V2i64 = trunc <2 x i64> undef to <2 x i1> @@ -1096,3 +810,6 @@ ret i32 undef } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX256: {{.*}} +; AVX512: {{.*}} diff --git a/llvm/test/Analysis/CostModel/X86/trunc-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/trunc-sizelatency.ll --- a/llvm/test/Analysis/CostModel/X86/trunc-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/trunc-sizelatency.ll @@ -537,26 +537,26 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> @@ -572,26 +572,26 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> @@ -607,26 +607,26 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> @@ -663,292 +663,6 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef -; -; AVX512-LABEL: 'trunc_vXi1' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i16 = trunc <5 x i16> undef to <5 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i8 = trunc <5 x i8> undef to <5 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i8 = trunc <6 x i8> undef to <6 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i8 = trunc <7 x i8> undef to <7 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i8 = trunc <10 x i8> undef to <10 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i8 = trunc <12 x i8> undef to <12 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i8 = trunc <14 x i8> undef to <14 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef -; -; AVX256-LABEL: 'trunc_vXi1' -; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1 -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i16 = trunc <5 x i16> undef to <5 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V5i8 = trunc <5 x i8> undef to <5 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V6i8 = trunc <6 x i8> undef to <6 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V7i8 = trunc <7 x i8> undef to <7 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V10i8 = trunc <10 x i8> undef to <10 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V12i8 = trunc <12 x i8> undef to <12 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V14i8 = trunc <14 x i8> undef to <14 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> -; AVX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %i64 = trunc i64 undef to i1 %V2i64 = trunc <2 x i64> undef to <2 x i1> @@ -1096,3 +810,6 @@ ret i32 undef } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX256: {{.*}} +; AVX512: {{.*}} diff --git a/llvm/test/Analysis/CostModel/X86/trunc.ll b/llvm/test/Analysis/CostModel/X86/trunc.ll --- a/llvm/test/Analysis/CostModel/X86/trunc.ll +++ b/llvm/test/Analysis/CostModel/X86/trunc.ll @@ -2754,26 +2754,26 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 231 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 462 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 660 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 924 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 368 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1584 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1848 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 736 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> @@ -2789,26 +2789,26 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 231 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 462 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 660 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 924 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1584 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1848 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> @@ -2824,26 +2824,26 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 231 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 462 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 660 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 924 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1584 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1848 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> @@ -2897,26 +2897,26 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 231 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 462 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 660 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 924 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 368 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1584 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1848 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 736 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> @@ -2932,26 +2932,26 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 231 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 462 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 660 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 924 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1584 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1848 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> @@ -2967,26 +2967,26 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 231 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 462 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 660 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 924 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1584 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1848 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> @@ -3040,26 +3040,26 @@ ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 704 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> @@ -3110,26 +3110,26 @@ ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> @@ -3145,26 +3145,26 @@ ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> -; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> +; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512FVEC256-LABEL: 'trunc_vXi1' @@ -3183,26 +3183,26 @@ ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 704 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> @@ -3288,26 +3288,26 @@ ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQVEC512-LABEL: 'trunc_vXi1' @@ -3396,26 +3396,26 @@ ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> @@ -3431,26 +3431,26 @@ ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> -; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> +; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> ; AVX512DQVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQVEC256-LABEL: 'trunc_vXi1' @@ -3574,26 +3574,26 @@ ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1> -; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> +; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1> -; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> +; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1> -; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> +; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1> -; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> +; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1> -; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> +; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1> -; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> +; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1> ; AVX512DQVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BWVEC512-LABEL: 'trunc_vXi1' @@ -3612,26 +3612,26 @@ ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 752 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> @@ -3647,26 +3647,26 @@ ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> +; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> ; AVX512BWVEC512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> @@ -3755,26 +3755,26 @@ ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 752 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> @@ -3790,26 +3790,26 @@ ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 896 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 304 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> +; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> ; AVX512BWVEC256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> @@ -3898,26 +3898,26 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 231 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 462 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 660 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 924 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 368 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1584 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1848 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 736 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1> @@ -3933,26 +3933,26 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 231 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 462 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 660 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 924 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1584 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1848 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1> @@ -3968,26 +3968,26 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 231 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 462 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 660 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 924 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1584 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1848 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1> diff --git a/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll b/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll --- a/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll +++ b/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll @@ -8,8 +8,9 @@ ; CHECK-NEXT: [[TST:%.*]] = load i1, ptr [[ADDR]], align 1 ; CHECK-NEXT: br i1 [[TST]], label [[NEXT:%.*]], label [[END:%.*]] ; CHECK: next: -; CHECK-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[OFFSET]] -; CHECK-NEXT: [[TMP1:%.*]] = load volatile i1, ptr [[SUNKADDR]], align 1 +; CHECK-NEXT: [[SUNKADDR:%.*]] = trunc i64 [[OFFSET]] to i32 +; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr i8, ptr [[BASE]], i32 [[SUNKADDR]] +; CHECK-NEXT: [[TMP1:%.*]] = load volatile i1, ptr [[SUNKADDR1]], align 1 ; CHECK-NEXT: ret void ; CHECK: end: ; CHECK-NEXT: ret void @@ -33,8 +34,9 @@ ; CHECK-NEXT: [[TST:%.*]] = load i1, ptr [[ADDR]], align 1 ; CHECK-NEXT: br i1 [[TST]], label [[NEXT:%.*]], label [[END:%.*]] ; CHECK: next: -; CHECK-NEXT: [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[OFFSET]] -; CHECK-NEXT: [[TMP1:%.*]] = load volatile i1, ptr [[SUNKADDR]], align 1 +; CHECK-NEXT: [[SUNKADDR:%.*]] = trunc i64 [[OFFSET]] to i32 +; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i32 [[SUNKADDR]] +; CHECK-NEXT: [[TMP1:%.*]] = load volatile i1, ptr [[SUNKADDR1]], align 1 ; CHECK-NEXT: ret void ; CHECK: end: ; CHECK-NEXT: ret void @@ -61,8 +63,8 @@ ; CHECK-NEXT: [[TST:%.*]] = load i1, ptr [[ADDR]], align 1 ; CHECK-NEXT: br i1 [[TST]], label [[NEXT:%.*]], label [[END:%.*]] ; CHECK: next: -; CHECK-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[OFFSET]] -; CHECK-NEXT: [[TMP1:%.*]] = load volatile i1, ptr [[SUNKADDR]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[ADDR64]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = load volatile i1, ptr [[TMP1]], align 1 ; CHECK-NEXT: ret void ; CHECK: end: ; CHECK-NEXT: ret void diff --git a/llvm/test/CodeGen/AArch64/sme-aarch64-svcount-O3.ll b/llvm/test/CodeGen/AArch64/sme-aarch64-svcount-O3.ll --- a/llvm/test/CodeGen/AArch64/sme-aarch64-svcount-O3.ll +++ b/llvm/test/CodeGen/AArch64/sme-aarch64-svcount-O3.ll @@ -5,14 +5,14 @@ define target("aarch64.svcount") @test_alloca_store_reload(target("aarch64.svcount") %val0, target("aarch64.svcount") %val1, ptr %iptr, ptr %pptr, i64 %N) nounwind { ; CHECK-LABEL: @test_alloca_store_reload( ; CHECK-NEXT: entry: -; CHECK-NEXT: store i64 0, ptr [[IPTR:%.*]], align 4 +; CHECK-NEXT: store i64 0, ptr [[IPTR:%.*]], align 8 ; CHECK-NEXT: store target("aarch64.svcount") [[VAL0:%.*]], ptr [[PPTR:%.*]], align 2 ; CHECK-NEXT: [[I1_PEEL:%.*]] = icmp eq i64 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[I1_PEEL]], label [[LOOP_EXIT:%.*]], label [[LOOP_BODY:%.*]] ; CHECK: loop.body: ; CHECK-NEXT: [[IND:%.*]] = phi i64 [ [[IND_NEXT:%.*]], [[LOOP_BODY]] ], [ 1, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[IPTR_GEP:%.*]] = getelementptr i64, ptr [[IPTR]], i64 [[IND]] -; CHECK-NEXT: store i64 [[IND]], ptr [[IPTR_GEP]], align 4 +; CHECK-NEXT: store i64 [[IND]], ptr [[IPTR_GEP]], align 8 ; CHECK-NEXT: store target("aarch64.svcount") [[VAL1:%.*]], ptr [[PPTR]], align 2 ; CHECK-NEXT: [[IND_NEXT]] = add i64 [[IND]], 1 ; CHECK-NEXT: [[I1:%.*]] = icmp eq i64 [[IND]], [[N]] diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll @@ -183,12 +183,12 @@ define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(ptr addrspace(1) %out) #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat ; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 4 +; AKF_HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 8 ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { -; ATTRIBUTOR_HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 4 +; ATTRIBUTOR_HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 8 ; ATTRIBUTOR_HSA-NEXT: ret void ; store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -69,7 +69,6 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -164,7 +163,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -280,7 +278,6 @@ ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -398,7 +395,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -480,7 +476,6 @@ ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c @@ -551,7 +546,6 @@ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c @@ -630,7 +624,6 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -715,7 +708,6 @@ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -788,7 +780,6 @@ ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c @@ -856,7 +847,6 @@ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c @@ -934,7 +924,6 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -1020,7 +1009,6 @@ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -1283,7 +1271,6 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 @@ -1591,7 +1578,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 @@ -1983,7 +1969,6 @@ ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s0, v3 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 @@ -2391,7 +2376,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 @@ -2657,7 +2641,6 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -2879,7 +2862,6 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -3129,7 +3111,6 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -3411,7 +3392,6 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -3544,7 +3524,6 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -3618,7 +3597,6 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c @@ -3700,7 +3678,6 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -3788,7 +3765,6 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -3934,7 +3910,6 @@ ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -4107,7 +4082,6 @@ ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -4302,7 +4276,6 @@ ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -4519,7 +4492,6 @@ ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -4713,7 +4685,6 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v3i15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 @@ -4908,7 +4879,6 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_v3i15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 @@ -5123,7 +5093,6 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_v3i15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 @@ -5360,7 +5329,6 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_v3i15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 @@ -5464,7 +5432,6 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i32_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -5501,7 +5468,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -5537,7 +5503,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -5579,7 +5544,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -5625,7 +5589,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -5771,7 +5734,6 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -5854,7 +5816,6 @@ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i32_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -5893,7 +5854,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -5930,7 +5890,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -5973,7 +5932,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -6105,7 +6063,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -6181,7 +6138,6 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i32_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -6221,7 +6177,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -6289,7 +6244,6 @@ ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -6365,7 +6319,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -6420,7 +6373,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -6605,7 +6557,6 @@ ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -6707,7 +6658,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i32_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -6750,7 +6700,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -6813,7 +6762,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -6888,7 +6836,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -7060,7 +7007,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c @@ -7134,7 +7080,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; CHECK-LABEL: @udiv_i64_oddk_denom( ; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943 -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: udiv_i64_oddk_denom: @@ -7233,7 +7179,6 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i64_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 @@ -7344,7 +7289,7 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; CHECK-LABEL: @udiv_i64_pow2k_denom( ; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 4096 -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: udiv_i64_pow2k_denom: @@ -7360,7 +7305,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -7380,7 +7324,7 @@ ; CHECK-LABEL: @udiv_i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: udiv_i64_pow2_shl_denom: @@ -7398,7 +7342,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -7443,7 +7386,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 @@ -7560,7 +7502,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 @@ -7692,7 +7633,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: udiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 @@ -7718,7 +7658,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; CHECK-LABEL: @urem_i64_oddk_denom( ; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993 -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: urem_i64_oddk_denom: @@ -7816,7 +7756,6 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i64_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 @@ -7925,7 +7864,7 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; CHECK-LABEL: @urem_i64_pow2k_denom( ; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 4096 -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: urem_i64_pow2k_denom: @@ -7941,7 +7880,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -7960,7 +7898,7 @@ ; CHECK-LABEL: @urem_i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: urem_i64_pow2_shl_denom: @@ -7981,7 +7919,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -8029,7 +7966,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 @@ -8084,7 +8020,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: urem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 @@ -8115,7 +8050,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; CHECK-LABEL: @sdiv_i64_oddk_denom( ; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195 -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: sdiv_i64_oddk_denom: @@ -8212,7 +8147,6 @@ ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i64_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s4, 0x33fe64 @@ -8315,7 +8249,7 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; CHECK-LABEL: @sdiv_i64_pow2k_denom( ; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 4096 -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: sdiv_i64_pow2k_denom: @@ -8335,7 +8269,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -8359,7 +8292,7 @@ ; CHECK-LABEL: @sdiv_i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: sdiv_i64_pow2_shl_denom: @@ -8498,7 +8431,6 @@ ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 @@ -8690,7 +8622,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 @@ -8829,7 +8760,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 @@ -9208,7 +9138,6 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 @@ -9510,7 +9439,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; CHECK-LABEL: @srem_i64_oddk_denom( ; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 1235195 -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: srem_i64_oddk_denom: @@ -9605,7 +9534,6 @@ ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i64_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s4, 0x33fe64 @@ -9711,7 +9639,7 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; CHECK-LABEL: @srem_i64_pow2k_denom( ; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 4096 -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: srem_i64_pow2k_denom: @@ -9733,7 +9661,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -9759,7 +9686,7 @@ ; CHECK-LABEL: @srem_i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: srem_i64_pow2_shl_denom: @@ -9896,7 +9823,6 @@ ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 @@ -10089,7 +10015,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 @@ -10388,7 +10313,6 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm -; ; GFX9-LABEL: srem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll @@ -56,12 +56,13 @@ ; CHECK-LABEL: define void @sincos_f32 ; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[__SINCOS_]]) -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[__SINCOS_]], align 4 -; CHECK-NEXT: store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4 +; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 +; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4 ; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]]) -; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4 +; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -166,13 +166,13 @@ ; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_id ; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: [[VAL:%.*]] = call i64 @llvm.amdgcn.dispatch.id() -; AKF_HSA-NEXT: store volatile i64 [[VAL]], ptr addrspace(1) undef, align 4 +; AKF_HSA-NEXT: store volatile i64 [[VAL]], ptr addrspace(1) undef, align 8 ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_id ; ATTRIBUTOR_HSA-SAME: () #[[ATTR9:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i64 @llvm.amdgcn.dispatch.id() -; ATTRIBUTOR_HSA-NEXT: store volatile i64 [[VAL]], ptr addrspace(1) undef, align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i64 [[VAL]], ptr addrspace(1) undef, align 8 ; ATTRIBUTOR_HSA-NEXT: ret void ; %val = call i64 @llvm.amdgcn.dispatch.id() diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll --- a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll +++ b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll @@ -42,7 +42,7 @@ ; CHECK-NEXT: [[VAL8:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: store volatile i32 [[VAL8]], ptr addrspace(1) null, align 4 ; CHECK-NEXT: [[VAL9:%.*]] = call i64 @llvm.amdgcn.dispatch.id() -; CHECK-NEXT: store volatile i64 [[VAL9]], ptr addrspace(1) null, align 4 +; CHECK-NEXT: store volatile i64 [[VAL9]], ptr addrspace(1) null, align 8 ; CHECK-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/lds-reject-absolute-addresses.ll b/llvm/test/CodeGen/AMDGPU/lds-reject-absolute-addresses.ll --- a/llvm/test/CodeGen/AMDGPU/lds-reject-absolute-addresses.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-reject-absolute-addresses.ll @@ -11,5 +11,5 @@ ret void } -!0 = !{i64 0, i64 1} +!0 = !{i32 0, i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll --- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll @@ -34,7 +34,7 @@ ; CHECK: @[[__INIT_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)] ; CHECK: @[[__FINI_ARRAY_START:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)] ; CHECK: @[[__FINI_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)] -; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata" +; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata" ; CHECK: @[[FOO_ALIAS:[a-zA-Z0-9_$"\\.-]+]] = hidden alias void (), ptr @foo ;. ; CHECK-LABEL: define void @foo( diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll --- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll @@ -51,7 +51,7 @@ ; CHECK: @[[__INIT_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)] ; CHECK: @[[__FINI_ARRAY_START:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)] ; CHECK: @[[__FINI_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)] -; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata" +; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata" ;. ; CHECK-LABEL: define internal void @foo() { ; CHECK-NEXT: ret void diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll --- a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll @@ -15,7 +15,7 @@ ;. ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 8, !absolute_symbol !0 -; CHECK: @llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" +; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" ; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t poison, align 16, !absolute_symbol !0 ; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t poison, align 16, !absolute_symbol !0 ; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t poison, align 2, !absolute_symbol !0 @@ -99,4 +99,4 @@ ; CHECK: attributes #3 = { "amdgpu-lds-size"="4" } ; CHECK: attributes #4 = { "amdgpu-lds-size"="9" } -; CHECK: !0 = !{i64 0, i64 1} +; CHECK: !0 = !{i32 0, i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll --- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll @@ -24,7 +24,7 @@ ; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t poison, align 16 ; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t poison, align 8 -; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t poison, align 16 +; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t poison, align 8 ; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t poison, align 4 ; CHECK-LABEL: @k1 @@ -133,11 +133,9 @@ ; Check that aligment is not propagated if use is not a pointer operand. ; CHECK-LABEL: @k4 -; SUPER-ALIGN_ON: store i32 poison, ptr addrspace(3) %gep, align 8 -; SUPER-ALIGN_OFF: store i32 poison, ptr addrspace(3) %gep, align 4 +; CHECK: store i32 poison, ptr addrspace(3) %gep, align 4 ; CHECK: store ptr addrspace(3) %gep, ptr poison, align 4 -; SUPER-ALIGN_ON: %val1 = cmpxchg volatile ptr addrspace(3) %gep, i32 1, i32 2 monotonic monotonic, align 8 -; SUPER-ALIGN_OFF: %val1 = cmpxchg volatile ptr addrspace(3) %gep, i32 1, i32 2 monotonic monotonic, align 4 +; CHECK: %val1 = cmpxchg volatile ptr addrspace(3) %gep, i32 1, i32 2 monotonic monotonic, align 4 ; CHECK: %val2 = cmpxchg volatile ptr poison, ptr addrspace(3) %gep, ptr addrspace(3) poison monotonic monotonic, align 4 define amdgpu_kernel void @k4() { %gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @lds.6, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll --- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll @@ -62,4 +62,4 @@ ret void } -; CHECK: !0 = !{i64 0, i64 1} +; CHECK: !0 = !{i32 0, i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll @@ -69,7 +69,7 @@ ; CHECK: attributes #[[ATTR0]] = { "amdgpu-lds-size"="7" } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ;. -; CHECK: [[META0:![0-9]+]] = !{i64 0, i64 1} +; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 1} ; CHECK: [[META1:![0-9]+]] = !{!2} ; CHECK: [[META2:![0-9]+]] = distinct !{!2, !3} ; CHECK: [[META3:![0-9]+]] = distinct !{!3} diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll @@ -42,7 +42,7 @@ !8 = !{!"omnipotent char", !9, i64 0} !9 = !{!"Simple C++ TBAA"} -; CHECK:!0 = !{i64 0, i64 1} +; CHECK:!0 = !{i32 0, i32 1} ; CHECK:!1 = !{!2, !3, i64 0} ; CHECK:!2 = !{!"no_clobber_ds_load_stores_x2_preexisting_aa", !3, i64 0} ; CHECK:!3 = !{!"int", !4, i64 0} diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll @@ -110,7 +110,7 @@ ret void } -; CHECK: !0 = !{i64 0, i64 1} +; CHECK: !0 = !{i32 0, i32 1} ; CHECK: !1 = !{!2} ; CHECK: !2 = distinct !{!2, !3} ; CHECK: !3 = distinct !{!3} diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll @@ -16,7 +16,7 @@ ; CHECK: @dynamic_kernel_only = external addrspace(3) global [0 x double] ; CHECK: @dynamic_shared8 = external addrspace(3) global [0 x i64], align 8 ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol !0 -; CHECK: @llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" +; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" ; Alignment of these must be the maximum of the alignment of the reachable symbols ; CHECK: @llvm.amdgcn.expect_align1.dynlds = external addrspace(3) global [0 x i8], align 1, !absolute_symbol !0 @@ -103,7 +103,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED8]], align 4 ; CHECK-NEXT: [[DYNAMIC_SHARED81:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], ptr addrspace(3) [[DYNAMIC_SHARED81]], i32 0, i32 7 -; CHECK-NEXT: store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 4 +; CHECK-NEXT: store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 8 ; CHECK-NEXT: ret void ; %arrayidx = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 7 @@ -149,7 +149,7 @@ ; CHECK-LABEL: define amdgpu_kernel void @expect_align8() !llvm.amdgcn.lds.kernel.id !5 { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align8.dynlds) ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 9 -; CHECK-NEXT: store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 4 +; CHECK-NEXT: store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 8 ; CHECK-NEXT: call void @use_shared8() ; CHECK-NEXT: ret void ; @@ -188,8 +188,8 @@ ; CHECK: attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) } ; CHECK: attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; CHECK: !0 = !{i64 0, i64 1} -; CHECK: !1 = !{i64 4, i64 5} +; CHECK: !0 = !{i32 0, i32 1} +; CHECK: !1 = !{i32 4, i32 5} ; CHECK: !2 = !{i32 0} ; CHECK: !3 = !{i32 1} ; CHECK: !4 = !{i32 2} diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll @@ -26,7 +26,7 @@ @llvm.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(3) @tolower to ptr), ptr addrspacecast (ptr addrspace(1) @ignored to ptr)], section "llvm.metadata" ; @ignored still in list, @tolower removed, llvm.amdgcn.module.lds appended -; CHECK: @llvm.compiler.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(1) @ignored to ptr), ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" +; CHECK: @llvm.compiler.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(1) @ignored to ptr), ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" @llvm.compiler.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(3) @tolower to ptr), ptr addrspacecast (ptr addrspace(1) @ignored to ptr)], section "llvm.metadata" diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -mtriple=amdgcn--amdhsa -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=hybrid | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=hybrid | FileCheck -check-prefix=GCN %s @@ -12,7 +12,7 @@ @unused = addrspace(3) global i16 poison ; OPT: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 16, !absolute_symbol !0 -; OPT: @llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" +; OPT: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" ; OPT: @llvm.amdgcn.kernel.kernel_no_table.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kernel_no_table.lds.t poison, align 8, !absolute_symbol !0 ; OPT: @llvm.amdgcn.kernel.k01.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k01.lds.t poison, align 4, !absolute_symbol !1 ; OPT: @llvm.amdgcn.kernel.k23.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k23.lds.t poison, align 8, !absolute_symbol !0 @@ -73,12 +73,12 @@ ; OPT-NEXT: [[V22:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 ; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V22]], align 4 ; OPT-NEXT: [[V23:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) -; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 4 +; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 8 ; OPT-NEXT: [[MUL:%.*]] = mul i64 [[LD]], 4 ; OPT-NEXT: [[V2:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 ; OPT-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[V2]], align 4 ; OPT-NEXT: [[V21:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3) -; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) [[V21]], align 4 +; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) [[V21]], align 8 ; OPT-NEXT: ret void ; ; GCN-LABEL: f2: @@ -193,7 +193,7 @@ define amdgpu_kernel void @k23() { ; OPT-LABEL: @k23( -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ], !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]] +; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ], !alias.scope !4, !noalias !7 ; OPT-NEXT: call void @f2() ; OPT-NEXT: call void @f3() ; OPT-NEXT: ret void @@ -231,12 +231,12 @@ ; Access and allocate three variables define amdgpu_kernel void @k123() { ; OPT-LABEL: @k123( -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope [[META10:![0-9]+]], !noalias [[META13:![0-9]+]] +; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope !10, !noalias !13 ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; OPT-NEXT: call void @f1() -; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META13]], !noalias [[META10]] +; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope !13, !noalias !10 ; OPT-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 8 -; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META13]], !noalias [[META10]] +; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope !13, !noalias !10 ; OPT-NEXT: call void @f2() ; OPT-NEXT: ret void ; @@ -285,14 +285,14 @@ ; OPT: attributes #0 = { "amdgpu-lds-size"="8" } -; OPT: attributes #1 = { "amdgpu-lds-size"="12" } -; OPT: attributes #2 = { "amdgpu-lds-size"="20" } +; OPT: attributes #1 = { "amdgpu-lds-size"="16" } +; OPT: attributes #2 = { "amdgpu-lds-size"="24" } ; OPT: attributes #3 = { nocallback nofree nosync nounwind willreturn memory(none) } ; OPT: attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; OPT: !0 = !{i64 0, i64 1} -; OPT: !1 = !{i64 4, i64 5} -; OPT: !2 = !{i64 8, i64 9} +; OPT: !0 = !{i32 0, i32 1} +; OPT: !1 = !{i32 4, i32 5} +; OPT: !2 = !{i32 8, i32 9} ; OPT: !3 = !{i32 1} ; OPT: !4 = !{!5} ; OPT: !5 = distinct !{!5, !6} diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -118,12 +118,12 @@ ; OPT-NEXT: [[V22:%.*]] = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 2 ; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V22]], align 4 ; OPT-NEXT: [[V23:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) -; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 4 +; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 8 ; OPT-NEXT: [[MUL:%.*]] = mul i64 [[LD]], 4 ; OPT-NEXT: [[V2:%.*]] = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 2 ; OPT-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[V2]], align 4 ; OPT-NEXT: [[V21:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3) -; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) [[V21]], align 4 +; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) [[V21]], align 8 ; OPT-NEXT: ret void ; ; GCN-LABEL: f2: @@ -300,7 +300,7 @@ ; Access and allocate three variables define amdgpu_kernel void @k123() { ; OPT-LABEL: define amdgpu_kernel void @k123( -; OPT-SAME: ) #[[ATTR2:[0-9]+]] !llvm.amdgcn.lds.kernel.id !13 { +; OPT-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id !13 { ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope !14, !noalias !17 ; OPT-NEXT: call void @f1() ; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope !20, !noalias !21 @@ -352,8 +352,7 @@ ; OPT: declare i32 @llvm.amdgcn.lds.kernel.id() ; OPT: attributes #0 = { "amdgpu-lds-size"="8" } -; OPT: attributes #1 = { "amdgpu-lds-size"="12" } -; OPT: attributes #2 = { "amdgpu-lds-size"="16" } +; OPT: attributes #1 = { "amdgpu-lds-size"="16" } !0 = !{i64 0, i64 1} !1 = !{i32 0} diff --git a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll --- a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll @@ -11,7 +11,7 @@ ; CHECK: @__init_array_end = external addrspace(1) constant [0 x ptr addrspace(1)] ; CHECK: @__fini_array_start = external addrspace(1) constant [0 x ptr addrspace(1)] ; CHECK: @__fini_array_end = external addrspace(1) constant [0 x ptr addrspace(1)] -; CHECK: @llvm.used = appending global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini] +; CHECK: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini] ; UTC_ARGS: --enable diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll --- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -50,7 +50,7 @@ ; IR-NEXT: bb: ; IR-NEXT: [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #[[ATTR4:[0-9]+]] ; IR-NEXT: [[MY_TMP1:%.*]] = getelementptr inbounds i64, ptr addrspace(3) [[ARG:%.*]], i32 [[MY_TMP]] -; IR-NEXT: [[MY_TMP2:%.*]] = load volatile i64, ptr addrspace(3) [[MY_TMP1]], align 4 +; IR-NEXT: [[MY_TMP2:%.*]] = load volatile i64, ptr addrspace(3) [[MY_TMP1]], align 8 ; IR-NEXT: br label [[BB5:%.*]] ; IR: bb3: ; IR-NEXT: br i1 true, label [[BB4:%.*]], label [[BB13:%.*]] @@ -93,6 +93,7 @@ ; IR: bb23: ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]]) ; IR-NEXT: ret void +; bb: %my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %my.tmp1 = getelementptr inbounds i64, ptr addrspace(3) %arg, i32 %my.tmp @@ -276,6 +277,7 @@ ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]]) ; IR-NEXT: store volatile i32 0, ptr addrspace(1) undef, align 4 ; IR-NEXT: ret void +; bb: %my.tmp1134 = load volatile i32, ptr addrspace(1) undef %my.tmp1235 = icmp slt i32 %my.tmp1134, 9 diff --git a/llvm/test/CodeGen/AMDGPU/opencl-printf.ll b/llvm/test/CodeGen/AMDGPU/opencl-printf.ll --- a/llvm/test/CodeGen/AMDGPU/opencl-printf.ll +++ b/llvm/test/CodeGen/AMDGPU/opencl-printf.ll @@ -86,7 +86,7 @@ ; GCN-NEXT: [[PRINTBUFFNEXTPTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], i32 4 ; GCN-NEXT: store i32 [[I32:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], i32 4 -; GCN-NEXT: store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 4 +; GCN-NEXT: store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 8 ; GCN-NEXT: [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 8 ; GCN-NEXT: store <2 x float> , ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 8 ; GCN-NEXT: [[PRINTBUFFNEXTPTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], i32 8 @@ -108,7 +108,7 @@ ; R600-NEXT: ret void ; ; GCN-LABEL: @format_str_ptr( -; GCN-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 44) +; GCN-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 36) ; GCN-NEXT: br label [[DOTSPLIT:%.*]] ; GCN: .split: ; GCN-NEXT: [[TMP1:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null @@ -120,12 +120,12 @@ ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store ptr [[PTR_FLAT:%.*]], ptr addrspace(1) [[PRINTBUFFGEP]], align 8 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 8 -; GCN-NEXT: store ptr addrspace(3) [[PTR_LDS:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8 -; GCN-NEXT: [[PRINTBUFFNEXTPTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8 +; GCN-NEXT: store ptr addrspace(3) [[PTR_LDS:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 4 +; GCN-NEXT: [[PRINTBUFFNEXTPTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 4 ; GCN-NEXT: store ptr addrspace(1) [[PTR_GLOBAL:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], align 8 ; GCN-NEXT: [[PRINTBUFFNEXTPTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], i32 8 -; GCN-NEXT: store ptr addrspace(5) [[PTR_STACK:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], align 8 -; GCN-NEXT: [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], i32 8 +; GCN-NEXT: store ptr addrspace(5) [[PTR_STACK:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], align 4 +; GCN-NEXT: [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], i32 4 ; GCN-NEXT: store ptr addrspace(4) [[PTR_CONST:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], align 8 ; GCN-NEXT: br label [[TMP3]] ; GCN: 3: @@ -145,7 +145,7 @@ ; GCN-NEXT: [[TMP2:%.*]] = sext i4 [[I4:%.*]] to i32 ; GCN-NEXT: [[TMP3:%.*]] = sext i8 [[I8:%.*]] to i32 ; GCN-NEXT: [[TMP4:%.*]] = sext i16 [[I16:%.*]] to i32 -; GCN-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 68) +; GCN-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 72) ; GCN-NEXT: br label [[DOTSPLIT:%.*]] ; GCN: .split: ; GCN-NEXT: [[TMP5:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null @@ -167,11 +167,11 @@ ; GCN-NEXT: [[PRINTBUFFNEXTPTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], i32 4 ; GCN-NEXT: store i32 [[I32:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], i32 4 -; GCN-NEXT: store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 4 +; GCN-NEXT: store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 8 ; GCN-NEXT: [[PRINTBUFFNEXTPTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], i32 8 -; GCN-NEXT: store i96 [[I96:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 4 -; GCN-NEXT: [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 12 -; GCN-NEXT: store i128 [[I128:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 4 +; GCN-NEXT: store i96 [[I96:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 16 +; GCN-NEXT: store i128 [[I128:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 8 ; GCN-NEXT: [[PRINTBUFFNEXTPTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], i32 16 ; GCN-NEXT: store i32 1234, ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], align 4 ; GCN-NEXT: br label [[TMP7]] @@ -192,7 +192,7 @@ ; GCN-NEXT: [[TMP2:%.*]] = zext i4 [[I4:%.*]] to i32 ; GCN-NEXT: [[TMP3:%.*]] = zext i8 [[I8:%.*]] to i32 ; GCN-NEXT: [[TMP4:%.*]] = zext i16 [[I16:%.*]] to i32 -; GCN-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 68) +; GCN-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 72) ; GCN-NEXT: br label [[DOTSPLIT:%.*]] ; GCN: .split: ; GCN-NEXT: [[TMP5:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null @@ -214,11 +214,11 @@ ; GCN-NEXT: [[PRINTBUFFNEXTPTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], i32 4 ; GCN-NEXT: store i32 [[I32:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], i32 4 -; GCN-NEXT: store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 4 +; GCN-NEXT: store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 8 ; GCN-NEXT: [[PRINTBUFFNEXTPTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], i32 8 -; GCN-NEXT: store i96 [[I96:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 4 -; GCN-NEXT: [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 12 -; GCN-NEXT: store i128 [[I128:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 4 +; GCN-NEXT: store i96 [[I96:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 16 +; GCN-NEXT: store i128 [[I128:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 8 ; GCN-NEXT: [[PRINTBUFFNEXTPTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], i32 16 ; GCN-NEXT: store i32 1234, ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], align 4 ; GCN-NEXT: br label [[TMP7]] diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll --- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -134,10 +134,10 @@ define i64 @cmpxchg_private_i64(ptr addrspace(5) %ptr) { ; IR-LABEL: @cmpxchg_private_i64( -; IR-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(5) [[PTR:%.*]], align 4 +; IR-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(5) [[PTR:%.*]], align 8 ; IR-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0 ; IR-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 1, i64 [[TMP1]] -; IR-NEXT: store i64 [[TMP3]], ptr addrspace(5) [[PTR]], align 4 +; IR-NEXT: store i64 [[TMP3]], ptr addrspace(5) [[PTR]], align 8 ; IR-NEXT: [[TMP4:%.*]] = insertvalue { i64, i1 } poison, i64 [[TMP1]], 0 ; IR-NEXT: [[TMP5:%.*]] = insertvalue { i64, i1 } [[TMP4]], i1 [[TMP2]], 1 ; IR-NEXT: [[RESULT_0:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll @@ -93,7 +93,7 @@ ; CHECK-NEXT: [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> undef, float [[FOO3_FCA_0_EXTRACT]], i32 0 ; CHECK-NEXT: [[FOO3_FCA_1_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO1]] ; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 ; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll @@ -7,7 +7,7 @@ ; CHECK-LABEL: @memset_all_zero( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <6 x i64> zeroinitializer, i64 [[VAL:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i32 1 ; CHECK-NEXT: ret void ; entry: @@ -24,7 +24,7 @@ ; CHECK-LABEL: @memset_all_5( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> , i64 [[VAL:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i32 1 ; CHECK-NEXT: ret void ; entry: @@ -42,7 +42,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5) ; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 32, i1 true) -; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4 +; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -57,7 +57,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5) ; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 31, i1 true) -; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4 +; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -73,7 +73,7 @@ ; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5) ; CHECK-NEXT: [[GEP:%.*]] = getelementptr [4 x i64], ptr addrspace(5) [[STACK]], i64 0, i64 1 ; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[GEP]], i8 0, i64 24, i1 true) -; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4 +; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll --- a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll @@ -1,8 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-rewrite-out-arguments < %s | FileCheck %s -; Temporarily add an explicit datalayout until https://reviews.llvm.org/D141060 lands -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8" -target triple = "amdgcn-amd-amdhsa" define void @no_ret_blocks() #0 { unreachable diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll --- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll +++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll @@ -36,11 +36,11 @@ ; CHECK-ALU64-NEXT: # %bb.0: # %entry ; CHECK-ALU64-NEXT: #DEBUG_VALUE: test:arg <- $r1 ; CHECK-ALU64-NEXT: .Ltmp0: -; CHECK-ALU64-NEXT: r1 = 20 +; CHECK-ALU64-NEXT: r1 = 16 ; CHECK-ALU64-NEXT: .Ltmp1: ; CHECK-ALU64-NEXT: .Ltmp2: ; CHECK-ALU64-NEXT: .Ltmp3: -; CHECK-ALU64-NEXT: r0 = 4 +; CHECK-ALU64-NEXT: r0 = 8 ; CHECK-ALU64-NEXT: .Ltmp4: ; CHECK-ALU64-NEXT: .loc 1 12 69 prologue_end # test.c:12:69 ; CHECK-ALU64-NEXT: .Ltmp5: @@ -67,11 +67,11 @@ ; CHECK-ALU32-NEXT: # %bb.0: # %entry ; CHECK-ALU32-NEXT: #DEBUG_VALUE: test:arg <- $r1 ; CHECK-ALU32-NEXT: .Ltmp0: -; CHECK-ALU32-NEXT: r1 = 20 +; CHECK-ALU32-NEXT: r1 = 16 ; CHECK-ALU32-NEXT: .Ltmp1: ; CHECK-ALU32-NEXT: .Ltmp2: ; CHECK-ALU32-NEXT: .Ltmp3: -; CHECK-ALU32-NEXT: r0 = 4 +; CHECK-ALU32-NEXT: r0 = 8 ; CHECK-ALU32-NEXT: .Ltmp4: ; CHECK-ALU32-NEXT: .loc 1 12 69 prologue_end # test.c:12:69 ; CHECK-ALU32-NEXT: .Ltmp5: diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll --- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll +++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll @@ -36,18 +36,18 @@ ; CHECK-ALU64-NEXT: # %bb.0: # %entry ; CHECK-ALU64-NEXT: #DEBUG_VALUE: test:arg <- $r1 ; CHECK-ALU64-NEXT: .Ltmp0: -; CHECK-ALU64-NEXT: r1 = 20 +; CHECK-ALU64-NEXT: r1 = 16 ; CHECK-ALU64-NEXT: .Ltmp1: ; CHECK-ALU64-NEXT: .Ltmp2: ; CHECK-ALU64-NEXT: .Ltmp3: -; CHECK-ALU64-NEXT: r0 = 4 +; CHECK-ALU64-NEXT: r0 = 8 ; CHECK-ALU64-NEXT: .Ltmp4: ; CHECK-ALU64-NEXT: .loc 1 12 69 prologue_end # test.c:12:69 ; CHECK-ALU64-NEXT: .Ltmp5: ; CHECK-ALU64-NEXT: .Ltmp6: ; CHECK-ALU64-NEXT: r0 += r1 ; CHECK-ALU64-NEXT: .Ltmp7: -; CHECK-ALU64-NEXT: r1 = 50 +; CHECK-ALU64-NEXT: r1 = 18 ; CHECK-ALU64-NEXT: .loc 1 13 67 # test.c:13:67 ; CHECK-ALU64-NEXT: .Ltmp8: ; CHECK-ALU64-NEXT: r0 += r1 @@ -67,18 +67,18 @@ ; CHECK-ALU32-NEXT: # %bb.0: # %entry ; CHECK-ALU32-NEXT: #DEBUG_VALUE: test:arg <- $r1 ; CHECK-ALU32-NEXT: .Ltmp0: -; CHECK-ALU32-NEXT: r1 = 20 +; CHECK-ALU32-NEXT: r1 = 16 ; CHECK-ALU32-NEXT: .Ltmp1: ; CHECK-ALU32-NEXT: .Ltmp2: ; CHECK-ALU32-NEXT: .Ltmp3: -; CHECK-ALU32-NEXT: r0 = 4 +; CHECK-ALU32-NEXT: r0 = 8 ; CHECK-ALU32-NEXT: .Ltmp4: ; CHECK-ALU32-NEXT: .loc 1 12 69 prologue_end # test.c:12:69 ; CHECK-ALU32-NEXT: .Ltmp5: ; CHECK-ALU32-NEXT: .Ltmp6: ; CHECK-ALU32-NEXT: w0 += w1 ; CHECK-ALU32-NEXT: .Ltmp7: -; CHECK-ALU32-NEXT: r1 = 50 +; CHECK-ALU32-NEXT: r1 = 18 ; CHECK-ALU32-NEXT: .loc 1 13 67 # test.c:13:67 ; CHECK-ALU32-NEXT: .Ltmp8: ; CHECK-ALU32-NEXT: w0 += w1 diff --git a/llvm/test/CodeGen/X86/expand-large-div-rem-sdiv129.ll b/llvm/test/CodeGen/X86/expand-large-div-rem-sdiv129.ll --- a/llvm/test/CodeGen/X86/expand-large-div-rem-sdiv129.ll +++ b/llvm/test/CodeGen/X86/expand-large-div-rem-sdiv129.ll @@ -4,7 +4,7 @@ define void @sdiv129(ptr %ptr, ptr %out) nounwind { ; CHECK-LABEL: @sdiv129( ; CHECK-NEXT: _udiv-special-cases: -; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16 ; CHECK-NEXT: [[TMP0:%.*]] = freeze i129 [[A]] ; CHECK-NEXT: [[TMP1:%.*]] = freeze i129 3 ; CHECK-NEXT: [[TMP2:%.*]] = ashr i129 [[TMP0]], 128 @@ -66,7 +66,7 @@ ; CHECK-NEXT: [[TMP48:%.*]] = phi i129 [ [[TMP25]], [[UDIV_LOOP_EXIT]] ], [ [[TMP20]], [[_UDIV_SPECIAL_CASES:%.*]] ] ; CHECK-NEXT: [[TMP49:%.*]] = xor i129 [[TMP48]], [[TMP8]] ; CHECK-NEXT: [[TMP50:%.*]] = sub i129 [[TMP49]], [[TMP8]] -; CHECK-NEXT: store i129 [[TMP50]], ptr [[OUT:%.*]], align 4 +; CHECK-NEXT: store i129 [[TMP50]], ptr [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; %a = load i129, ptr %ptr diff --git a/llvm/test/CodeGen/X86/expand-large-div-rem-srem129.ll b/llvm/test/CodeGen/X86/expand-large-div-rem-srem129.ll --- a/llvm/test/CodeGen/X86/expand-large-div-rem-srem129.ll +++ b/llvm/test/CodeGen/X86/expand-large-div-rem-srem129.ll @@ -4,7 +4,7 @@ define void @test(ptr %ptr, ptr %out) nounwind { ; CHECK-LABEL: @test( ; CHECK-NEXT: _udiv-special-cases: -; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16 ; CHECK-NEXT: [[TMP0:%.*]] = freeze i129 [[A]] ; CHECK-NEXT: [[TMP1:%.*]] = freeze i129 3 ; CHECK-NEXT: [[TMP2:%.*]] = ashr i129 [[TMP0]], 128 @@ -69,7 +69,7 @@ ; CHECK-NEXT: [[TMP51:%.*]] = sub i129 [[TMP8]], [[TMP50]] ; CHECK-NEXT: [[TMP52:%.*]] = xor i129 [[TMP51]], [[TMP2]] ; CHECK-NEXT: [[TMP53:%.*]] = sub i129 [[TMP52]], [[TMP2]] -; CHECK-NEXT: store i129 [[TMP53]], ptr [[OUT:%.*]], align 4 +; CHECK-NEXT: store i129 [[TMP53]], ptr [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; %a = load i129, ptr %ptr diff --git a/llvm/test/CodeGen/X86/expand-large-div-rem-udiv129.ll b/llvm/test/CodeGen/X86/expand-large-div-rem-udiv129.ll --- a/llvm/test/CodeGen/X86/expand-large-div-rem-udiv129.ll +++ b/llvm/test/CodeGen/X86/expand-large-div-rem-udiv129.ll @@ -4,7 +4,7 @@ define void @test(ptr %ptr, ptr %out) nounwind { ; CHECK-LABEL: @test( ; CHECK-NEXT: _udiv-special-cases: -; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16 ; CHECK-NEXT: [[TMP0:%.*]] = freeze i129 3 ; CHECK-NEXT: [[TMP1:%.*]] = freeze i129 [[A]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i129 [[TMP0]], 0 @@ -55,7 +55,7 @@ ; CHECK-NEXT: br i1 [[TMP38]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]] ; CHECK: udiv-end: ; CHECK-NEXT: [[TMP39:%.*]] = phi i129 [ [[TMP16]], [[UDIV_LOOP_EXIT]] ], [ [[TMP11]], [[_UDIV_SPECIAL_CASES:%.*]] ] -; CHECK-NEXT: store i129 [[TMP39]], ptr [[OUT:%.*]], align 4 +; CHECK-NEXT: store i129 [[TMP39]], ptr [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; %a = load i129, ptr %ptr diff --git a/llvm/test/CodeGen/X86/expand-large-div-rem-urem129.ll b/llvm/test/CodeGen/X86/expand-large-div-rem-urem129.ll --- a/llvm/test/CodeGen/X86/expand-large-div-rem-urem129.ll +++ b/llvm/test/CodeGen/X86/expand-large-div-rem-urem129.ll @@ -4,7 +4,7 @@ define void @test(ptr %ptr, ptr %out) nounwind { ; CHECK-LABEL: @test( ; CHECK-NEXT: _udiv-special-cases: -; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16 ; CHECK-NEXT: [[TMP0:%.*]] = freeze i129 [[A]] ; CHECK-NEXT: [[TMP1:%.*]] = freeze i129 3 ; CHECK-NEXT: [[TMP2:%.*]] = freeze i129 [[TMP1]] @@ -59,7 +59,7 @@ ; CHECK-NEXT: [[TMP41:%.*]] = phi i129 [ [[TMP18]], [[UDIV_LOOP_EXIT]] ], [ [[TMP13]], [[_UDIV_SPECIAL_CASES:%.*]] ] ; CHECK-NEXT: [[TMP42:%.*]] = mul i129 [[TMP1]], [[TMP41]] ; CHECK-NEXT: [[TMP43:%.*]] = sub i129 [[TMP0]], [[TMP42]] -; CHECK-NEXT: store i129 [[TMP43]], ptr [[OUT:%.*]], align 4 +; CHECK-NEXT: store i129 [[TMP43]], ptr [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; %a = load i129, ptr %ptr diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll b/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll --- a/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll @@ -80,7 +80,7 @@ ; AARCH64-SCOPE-LABEL: @standard_lifetime( ; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1:![0-9]+]]) ; AARCH64-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -88,13 +88,13 @@ ; AARCH64-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -132,7 +132,7 @@ ; AARCH64-NOSCOPE-LABEL: @standard_lifetime( ; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1:![0-9]+]]) ; AARCH64-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -140,13 +140,13 @@ ; AARCH64-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -182,7 +182,7 @@ ; AARCH64-SHORT-SCOPE-LABEL: @standard_lifetime( ; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-SHORT-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1:![0-9]+]]) ; AARCH64-SHORT-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -190,13 +190,13 @@ ; AARCH64-SHORT-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-SHORT-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-SHORT-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -237,7 +237,7 @@ ; AARCH64-SHORT-NOSCOPE-LABEL: @standard_lifetime( ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1:![0-9]+]]) ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -245,13 +245,13 @@ ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-SHORT-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -361,7 +361,7 @@ ; AARCH64-SCOPE-LABEL: @standard_lifetime_optnone( ; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]]) ; AARCH64-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -369,13 +369,13 @@ ; AARCH64-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -413,7 +413,7 @@ ; AARCH64-NOSCOPE-LABEL: @standard_lifetime_optnone( ; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]]) ; AARCH64-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -421,13 +421,13 @@ ; AARCH64-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -463,7 +463,7 @@ ; AARCH64-SHORT-SCOPE-LABEL: @standard_lifetime_optnone( ; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-SHORT-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]]) ; AARCH64-SHORT-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -471,13 +471,13 @@ ; AARCH64-SHORT-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-SHORT-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-SHORT-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -518,7 +518,7 @@ ; AARCH64-SHORT-NOSCOPE-LABEL: @standard_lifetime_optnone( ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]]) ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -526,13 +526,13 @@ ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-SHORT-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -632,7 +632,7 @@ ; AARCH64-SCOPE-LABEL: @multiple_lifetimes( ; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]]) ; AARCH64-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -640,13 +640,13 @@ ; AARCH64-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -678,7 +678,7 @@ ; AARCH64-NOSCOPE-LABEL: @multiple_lifetimes( ; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]]) ; AARCH64-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -686,13 +686,13 @@ ; AARCH64-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -724,7 +724,7 @@ ; AARCH64-SHORT-SCOPE-LABEL: @multiple_lifetimes( ; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-SHORT-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]]) ; AARCH64-SHORT-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -732,13 +732,13 @@ ; AARCH64-SHORT-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-SHORT-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-SHORT-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -773,7 +773,7 @@ ; AARCH64-SHORT-NOSCOPE-LABEL: @multiple_lifetimes( ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]]) ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -781,13 +781,13 @@ ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-SHORT-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -892,7 +892,7 @@ ; AARCH64-SCOPE-LABEL: @unreachable_exit( ; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]]) ; AARCH64-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -900,13 +900,13 @@ ; AARCH64-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -949,7 +949,7 @@ ; AARCH64-NOSCOPE-LABEL: @unreachable_exit( ; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]]) ; AARCH64-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -957,13 +957,13 @@ ; AARCH64-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -1005,7 +1005,7 @@ ; AARCH64-SHORT-SCOPE-LABEL: @unreachable_exit( ; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-SHORT-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]]) ; AARCH64-SHORT-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -1013,13 +1013,13 @@ ; AARCH64-SHORT-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-SHORT-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-SHORT-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -1065,7 +1065,7 @@ ; AARCH64-SHORT-NOSCOPE-LABEL: @unreachable_exit( ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]]) ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -1073,13 +1073,13 @@ ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-SHORT-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -1200,7 +1200,7 @@ ; AARCH64-SCOPE-LABEL: @diamond_lifetime( ; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]]) ; AARCH64-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -1208,13 +1208,13 @@ ; AARCH64-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -1261,7 +1261,7 @@ ; AARCH64-NOSCOPE-LABEL: @diamond_lifetime( ; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]]) ; AARCH64-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -1269,13 +1269,13 @@ ; AARCH64-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -1313,7 +1313,7 @@ ; AARCH64-SHORT-SCOPE-LABEL: @diamond_lifetime( ; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-SHORT-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]]) ; AARCH64-SHORT-SCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -1321,13 +1321,13 @@ ; AARCH64-SHORT-SCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-SHORT-SCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-SHORT-SCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-SHORT-SCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr @@ -1377,7 +1377,7 @@ ; AARCH64-SHORT-NOSCOPE-LABEL: @diamond_lifetime( ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 -; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 +; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]]) ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -1385,13 +1385,13 @@ ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 44 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]] ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr -; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 4 +; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP9]], ptr [[TMP10]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP11:%.*]] = ashr i64 [[TMP3]], 56 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], -1 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]] -; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 4 +; AARCH64-SHORT-NOSCOPE-NEXT: store i64 [[TMP15]], ptr [[TMP2]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295 ; AARCH64-SHORT-NOSCOPE-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr diff --git a/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll b/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll --- a/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll @@ -6,14 +6,14 @@ define i32 @foo() #0 { ; CHECK-LABEL: define i32 @foo() comdat { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @__sanitizer_cov_trace_pc_guard(ptr @__sancov_gen_) #[[ATTR1:[0-9]+]] +; CHECK-NEXT: call void @__sanitizer_cov_trace_pc_guard(ptr inttoptr (i32 ptrtoint (ptr @__sancov_gen_ to i32) to ptr)) #[[ATTR1:[0-9]+]] ; CHECK-NEXT: ret i32 0 ; entry: ret i32 0 } -; CHECK-DAG: declare void @__sanitizer_cov_trace_pc_indir(i64) +; CHECK-DAG: declare void @__sanitizer_cov_trace_pc_indir(i32) ; CHECK-DAG: declare void @__sanitizer_cov_trace_cmp1(i8 zeroext, i8 zeroext) ; CHECK-DAG: declare void @__sanitizer_cov_trace_cmp2(i16 zeroext, i16 zeroext) ; CHECK-DAG: declare void @__sanitizer_cov_trace_cmp4(i32 zeroext, i32 zeroext) @@ -24,7 +24,7 @@ ; CHECK-DAG: declare void @__sanitizer_cov_trace_const_cmp8(i64, i64) ; CHECK-DAG: declare void @__sanitizer_cov_trace_div4(i32 zeroext) ; CHECK-DAG: declare void @__sanitizer_cov_trace_div8(i64) -; CHECK-DAG: declare void @__sanitizer_cov_trace_gep(i64) +; CHECK-DAG: declare void @__sanitizer_cov_trace_gep(i32) ; CHECK-DAG: declare void @__sanitizer_cov_trace_switch(i64, ptr) ; CHECK-DAG: declare void @__sanitizer_cov_trace_pc() ; CHECK-DAG: declare void @__sanitizer_cov_trace_pc_guard(ptr) diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll @@ -1,96 +1,172 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -atomic-expand %s | FileCheck %s -; RUN: opt -mtriple=r600-mesa-mesa3d -S -atomic-expand %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -atomic-expand %s | FileCheck %s --check-prefixes=CHECK,GCN +; RUN: opt -mtriple=r600-mesa-mesa3d -S -atomic-expand %s | FileCheck %s --check-prefixes=CHECK,R600 define i8 @test_atomicrmw_xchg_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_xchg_i8_global_agent( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP5]], [[VALOPERAND_SHIFTED]] -; CHECK-NEXT: [[TMP7:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP6]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1 -; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0 -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_xchg_i8_global_agent( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[TMP5:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[TMP6:%.*]] = or i32 [[TMP5]], [[VALOPERAND_SHIFTED]] +; GCN-NEXT: [[TMP7:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP6]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_xchg_i8_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]] +; R600-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[TMP5:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[TMP6:%.*]] = or i32 [[TMP5]], [[VALOPERAND_SHIFTED]] +; R600-NEXT: [[TMP7:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP6]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED]] ; %res = atomicrmw xchg ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res } define i8 @test_atomicrmw_add_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_add_i8_global_agent( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]] -; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] -; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 -; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_add_i8_global_agent( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; GCN-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] +; GCN-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] +; GCN-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_add_i8_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]] +; R600-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; R600-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] +; R600-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] +; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED]] ; %res = atomicrmw add ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res } define i8 @test_atomicrmw_add_i8_global_agent_align2(ptr addrspace(1) %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_add_i8_global_agent_align2( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]] -; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] -; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 -; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_add_i8_global_agent_align2( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; GCN-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] +; GCN-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] +; GCN-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_add_i8_global_agent_align2( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]] +; R600-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; R600-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] +; R600-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] +; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED]] ; %res = atomicrmw add ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 2 ret i8 %res @@ -120,303 +196,546 @@ } define i8 @test_atomicrmw_sub_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_sub_i8_global_agent( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALOPERAND_SHIFTED]] -; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] -; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 -; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_sub_i8_global_agent( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; GCN-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] +; GCN-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] +; GCN-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_sub_i8_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]] +; R600-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; R600-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] +; R600-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] +; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED]] ; %res = atomicrmw sub ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res } define i8 @test_atomicrmw_and_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_and_i8_global_agent( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[INV_MASK]], [[VALOPERAND_SHIFTED]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_and_i8_global_agent( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[ANDOPERAND:%.*]] = or i32 [[INV_MASK]], [[VALOPERAND_SHIFTED]] +; GCN-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4 +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_and_i8_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]] +; R600-NEXT: [[ANDOPERAND:%.*]] = or i32 [[INV_MASK]], [[VALOPERAND_SHIFTED]] +; R600-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4 +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res } define i8 @test_atomicrmw_nand_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_nand_i8_global_agent( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[LOADED]], [[VALOPERAND_SHIFTED]] -; CHECK-NEXT: [[NEW:%.*]] = xor i32 [[TMP5]], -1 -; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[NEW]], [[MASK]] -; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP8]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP9]], 1 -; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP9]], 0 -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_nand_i8_global_agent( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[TMP5:%.*]] = and i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; GCN-NEXT: [[NEW:%.*]] = xor i32 [[TMP5]], -1 +; GCN-NEXT: [[TMP6:%.*]] = and i32 [[NEW]], [[MASK]] +; GCN-NEXT: [[TMP7:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[TMP8:%.*]] = or i32 [[TMP7]], [[TMP6]] +; GCN-NEXT: [[TMP9:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP8]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP9]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP9]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_nand_i8_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]] +; R600-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[TMP5:%.*]] = and i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; R600-NEXT: [[NEW:%.*]] = xor i32 [[TMP5]], -1 +; R600-NEXT: [[TMP6:%.*]] = and i32 [[NEW]], [[MASK]] +; R600-NEXT: [[TMP7:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[TMP8:%.*]] = or i32 [[TMP7]], [[TMP6]] +; R600-NEXT: [[TMP9:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP8]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP9]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP9]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED]] ; %res = atomicrmw nand ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res } define i8 @test_atomicrmw_or_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_or_i8_global_agent( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw or ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_or_i8_global_agent( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[TMP4:%.*]] = atomicrmw or ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4 +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_or_i8_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]] +; R600-NEXT: [[TMP4:%.*]] = atomicrmw or ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4 +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res } define i8 @test_atomicrmw_xor_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_xor_i8_global_agent( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw xor ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_xor_i8_global_agent( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[TMP4:%.*]] = atomicrmw xor ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4 +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_xor_i8_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]] +; R600-NEXT: [[TMP4:%.*]] = atomicrmw xor ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4 +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED]] ; %res = atomicrmw xor ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res } define i8 @test_atomicrmw_max_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_max_i8_global_agent( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i8 [[EXTRACTED]], [[VALUE:%.*]] -; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]] -; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; GCN-LABEL: @test_atomicrmw_max_i8_global_agent( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: [[TMP4:%.*]] = icmp sgt i8 [[EXTRACTED]], [[VALUE:%.*]] +; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]] +; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_max_i8_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = icmp sgt i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw max ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res } define i8 @test_atomicrmw_min_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_min_i8_global_agent( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: [[TMP4:%.*]] = icmp sle i8 [[EXTRACTED]], [[VALUE:%.*]] -; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]] -; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; GCN-LABEL: @test_atomicrmw_min_i8_global_agent( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: [[TMP4:%.*]] = icmp sle i8 [[EXTRACTED]], [[VALUE:%.*]] +; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]] +; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_min_i8_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = icmp sle i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw min ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res } define i8 @test_atomicrmw_umax_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_umax_i8_global_agent( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] -; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]] -; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; GCN-LABEL: @test_atomicrmw_umax_i8_global_agent( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: [[TMP4:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] +; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]] +; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_umax_i8_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw umax ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res } define i8 @test_atomicrmw_umin_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_umin_i8_global_agent( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ule i8 [[EXTRACTED]], [[VALUE:%.*]] -; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]] -; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; GCN-LABEL: @test_atomicrmw_umin_i8_global_agent( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: [[TMP4:%.*]] = icmp ule i8 [[EXTRACTED]], [[VALUE:%.*]] +; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]] +; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_umin_i8_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = icmp ule i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw umin ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res } define i8 @test_cmpxchg_i8_global_agent(ptr addrspace(1) %out, i8 %in, i8 %old) { -; CHECK-LABEL: @test_cmpxchg_i8_global_agent( -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT:%.*]], i64 4 -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[GEP]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[GEP]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[IN:%.*]] to i32 -; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP5:%.*]] = zext i8 [[OLD:%.*]] to i32 -; CHECK-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = and i32 [[TMP7]], [[INV_MASK]] -; CHECK-NEXT: br label [[PARTWORD_CMPXCHG_LOOP:%.*]] -; CHECK: partword.cmpxchg.loop: -; CHECK-NEXT: [[TMP9:%.*]] = phi i32 [ [[TMP8]], [[TMP0:%.*]] ], [ [[TMP15:%.*]], [[PARTWORD_CMPXCHG_FAILURE:%.*]] ] -; CHECK-NEXT: [[TMP10:%.*]] = or i32 [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = or i32 [[TMP9]], [[TMP6]] -; CHECK-NEXT: [[TMP12:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[TMP11]], i32 [[TMP10]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { i32, i1 } [[TMP12]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { i32, i1 } [[TMP12]], 1 -; CHECK-NEXT: br i1 [[TMP14]], label [[PARTWORD_CMPXCHG_END:%.*]], label [[PARTWORD_CMPXCHG_FAILURE]] -; CHECK: partword.cmpxchg.failure: -; CHECK-NEXT: [[TMP15]] = and i32 [[TMP13]], [[INV_MASK]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP9]], [[TMP15]] -; CHECK-NEXT: br i1 [[TMP16]], label [[PARTWORD_CMPXCHG_LOOP]], label [[PARTWORD_CMPXCHG_END]] -; CHECK: partword.cmpxchg.end: -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP13]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { i8, i1 } poison, i8 [[EXTRACTED]], 0 -; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { i8, i1 } [[TMP17]], i1 [[TMP14]], 1 -; CHECK-NEXT: [[EXTRACT:%.*]] = extractvalue { i8, i1 } [[TMP18]], 0 -; CHECK-NEXT: ret i8 [[EXTRACT]] +; GCN-LABEL: @test_cmpxchg_i8_global_agent( +; GCN-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT:%.*]], i64 4 +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[GEP]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[GEP]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i8 [[IN:%.*]] to i32 +; GCN-NEXT: [[TMP4:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[TMP5:%.*]] = zext i8 [[OLD:%.*]] to i32 +; GCN-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], [[SHIFTAMT]] +; GCN-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: [[TMP8:%.*]] = and i32 [[TMP7]], [[INV_MASK]] +; GCN-NEXT: br label [[PARTWORD_CMPXCHG_LOOP:%.*]] +; GCN: partword.cmpxchg.loop: +; GCN-NEXT: [[TMP9:%.*]] = phi i32 [ [[TMP8]], [[TMP0:%.*]] ], [ [[TMP15:%.*]], [[PARTWORD_CMPXCHG_FAILURE:%.*]] ] +; GCN-NEXT: [[TMP10:%.*]] = or i32 [[TMP9]], [[TMP4]] +; GCN-NEXT: [[TMP11:%.*]] = or i32 [[TMP9]], [[TMP6]] +; GCN-NEXT: [[TMP12:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[TMP11]], i32 [[TMP10]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[TMP13:%.*]] = extractvalue { i32, i1 } [[TMP12]], 0 +; GCN-NEXT: [[TMP14:%.*]] = extractvalue { i32, i1 } [[TMP12]], 1 +; GCN-NEXT: br i1 [[TMP14]], label [[PARTWORD_CMPXCHG_END:%.*]], label [[PARTWORD_CMPXCHG_FAILURE]] +; GCN: partword.cmpxchg.failure: +; GCN-NEXT: [[TMP15]] = and i32 [[TMP13]], [[INV_MASK]] +; GCN-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP9]], [[TMP15]] +; GCN-NEXT: br i1 [[TMP16]], label [[PARTWORD_CMPXCHG_LOOP]], label [[PARTWORD_CMPXCHG_END]] +; GCN: partword.cmpxchg.end: +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP13]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: [[TMP17:%.*]] = insertvalue { i8, i1 } poison, i8 [[EXTRACTED]], 0 +; GCN-NEXT: [[TMP18:%.*]] = insertvalue { i8, i1 } [[TMP17]], i1 [[TMP14]], 1 +; GCN-NEXT: [[EXTRACT:%.*]] = extractvalue { i8, i1 } [[TMP18]], 0 +; GCN-NEXT: ret i8 [[EXTRACT]] +; +; R600-LABEL: @test_cmpxchg_i8_global_agent( +; R600-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT:%.*]], i64 4 +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[GEP]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[GEP]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i8 [[IN:%.*]] to i32 +; R600-NEXT: [[TMP4:%.*]] = shl i32 [[TMP3]], [[TMP2]] +; R600-NEXT: [[TMP5:%.*]] = zext i8 [[OLD:%.*]] to i32 +; R600-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], [[TMP2]] +; R600-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: [[TMP8:%.*]] = and i32 [[TMP7]], [[INV_MASK]] +; R600-NEXT: br label [[PARTWORD_CMPXCHG_LOOP:%.*]] +; R600: partword.cmpxchg.loop: +; R600-NEXT: [[TMP9:%.*]] = phi i32 [ [[TMP8]], [[TMP0:%.*]] ], [ [[TMP15:%.*]], [[PARTWORD_CMPXCHG_FAILURE:%.*]] ] +; R600-NEXT: [[TMP10:%.*]] = or i32 [[TMP9]], [[TMP4]] +; R600-NEXT: [[TMP11:%.*]] = or i32 [[TMP9]], [[TMP6]] +; R600-NEXT: [[TMP12:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[TMP11]], i32 [[TMP10]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[TMP13:%.*]] = extractvalue { i32, i1 } [[TMP12]], 0 +; R600-NEXT: [[TMP14:%.*]] = extractvalue { i32, i1 } [[TMP12]], 1 +; R600-NEXT: br i1 [[TMP14]], label [[PARTWORD_CMPXCHG_END:%.*]], label [[PARTWORD_CMPXCHG_FAILURE]] +; R600: partword.cmpxchg.failure: +; R600-NEXT: [[TMP15]] = and i32 [[TMP13]], [[INV_MASK]] +; R600-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP9]], [[TMP15]] +; R600-NEXT: br i1 [[TMP16]], label [[PARTWORD_CMPXCHG_LOOP]], label [[PARTWORD_CMPXCHG_END]] +; R600: partword.cmpxchg.end: +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP13]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP17:%.*]] = insertvalue { i8, i1 } poison, i8 [[EXTRACTED]], 0 +; R600-NEXT: [[TMP18:%.*]] = insertvalue { i8, i1 } [[TMP17]], i1 [[TMP14]], 1 +; R600-NEXT: [[EXTRACT:%.*]] = extractvalue { i8, i1 } [[TMP18]], 0 +; R600-NEXT: ret i8 [[EXTRACT]] ; %gep = getelementptr i8, ptr addrspace(1) %out, i64 4 %res = cmpxchg ptr addrspace(1) %gep, i8 %old, i8 %in syncscope("agent") seq_cst seq_cst @@ -427,17 +746,16 @@ define i8 @test_cmpxchg_i8_local_align2(ptr addrspace(3) %out, i8 %in, i8 %old) { ; CHECK-LABEL: @test_cmpxchg_i8_local_align2( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT:%.*]], i64 4 -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[GEP]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[GEP]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[GEP]], i32 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[GEP]] to i32 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[IN:%.*]] to i32 -; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = zext i8 [[OLD:%.*]] to i32 -; CHECK-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = and i32 [[TMP7]], [[INV_MASK]] ; CHECK-NEXT: br label [[PARTWORD_CMPXCHG_LOOP:%.*]] @@ -454,7 +772,7 @@ ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP9]], [[TMP15]] ; CHECK-NEXT: br i1 [[TMP16]], label [[PARTWORD_CMPXCHG_LOOP]], label [[PARTWORD_CMPXCHG_END]] ; CHECK: partword.cmpxchg.end: -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP13]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP13]], [[TMP2]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { i8, i1 } poison, i8 [[EXTRACTED]], 0 ; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { i8, i1 } [[TMP17]], i1 [[TMP14]], 1 @@ -468,70 +786,128 @@ } define i8 @test_atomicrmw_inc_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_inc_i8_global_agent( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] -; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] -; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; GCN-LABEL: @test_atomicrmw_inc_i8_global_agent( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 +; GCN-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] +; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] +; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_inc_i8_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res } define i8 @test_atomicrmw_inc_i8_global_agent_align2(ptr addrspace(1) %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_inc_i8_global_agent_align2( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] -; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] -; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; GCN-LABEL: @test_atomicrmw_inc_i8_global_agent_align2( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 +; GCN-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] +; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] +; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_inc_i8_global_agent_align2( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 2 ret i8 %res @@ -564,24 +940,23 @@ define i8 @test_atomicrmw_inc_i8_local(ptr addrspace(3) %ptr, i8 %value) { ; CHECK-LABEL: @test_atomicrmw_inc_i8_local( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] ; CHECK: atomicrmw.start: ; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] ; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] ; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 @@ -589,7 +964,7 @@ ; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED3]] ; @@ -599,24 +974,23 @@ define i8 @test_atomicrmw_inc_i8_local_align2(ptr addrspace(3) %ptr, i8 %value) { ; CHECK-LABEL: @test_atomicrmw_inc_i8_local_align2( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] ; CHECK: atomicrmw.start: ; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] ; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] ; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 @@ -624,7 +998,7 @@ ; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED3]] ; @@ -658,70 +1032,128 @@ } define i8 @test_atomicrmw_inc_i8_flat_agent(ptr %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_inc_i8_flat_agent( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] -; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] -; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; GCN-LABEL: @test_atomicrmw_inc_i8_flat_agent( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 +; GCN-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] +; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] +; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_inc_i8_flat_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw uinc_wrap ptr %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res } define i8 @test_atomicrmw_inc_i8_flat_agent_align2(ptr %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_inc_i8_flat_agent_align2( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] -; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] -; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; GCN-LABEL: @test_atomicrmw_inc_i8_flat_agent_align2( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 +; GCN-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] +; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] +; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; GCN-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_inc_i8_flat_agent_align2( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw uinc_wrap ptr %ptr, i8 %value syncscope("agent") seq_cst, align 2 ret i8 %res @@ -753,74 +1185,136 @@ } define i8 @test_atomicrmw_dec_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_dec_i8_global_agent( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] -; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]] -; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 -; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; GCN-LABEL: @test_atomicrmw_dec_i8_global_agent( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1 +; GCN-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0 +; GCN-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] +; GCN-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]] +; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; GCN-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_dec_i8_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0 +; R600-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res } define i8 @test_atomicrmw_dec_i8_global_agent_align2(ptr addrspace(1) %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_dec_i8_global_agent_align2( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] -; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]] -; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 -; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; GCN-LABEL: @test_atomicrmw_dec_i8_global_agent_align2( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1 +; GCN-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0 +; GCN-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] +; GCN-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]] +; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; GCN-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_dec_i8_global_agent_align2( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0 +; R600-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 2 ret i8 %res @@ -855,18 +1349,17 @@ define i8 @test_atomicrmw_dec_i8_local(ptr addrspace(3) %ptr, i8 %value) { ; CHECK-LABEL: @test_atomicrmw_dec_i8_local( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] ; CHECK: atomicrmw.start: ; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0 @@ -874,7 +1367,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]] ; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 @@ -882,7 +1375,7 @@ ; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 ; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED3]] ; @@ -892,18 +1385,17 @@ define i8 @test_atomicrmw_dec_i8_local_align2(ptr addrspace(3) %ptr, i8 %value) { ; CHECK-LABEL: @test_atomicrmw_dec_i8_local_align2( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] ; CHECK: atomicrmw.start: ; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0 @@ -911,7 +1403,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]] ; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 @@ -919,7 +1411,7 @@ ; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 ; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED3]] ; @@ -955,74 +1447,136 @@ } define i8 @test_atomicrmw_dec_i8_flat_agent(ptr %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_dec_i8_flat_agent( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] -; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]] -; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 -; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; GCN-LABEL: @test_atomicrmw_dec_i8_flat_agent( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1 +; GCN-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0 +; GCN-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] +; GCN-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]] +; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; GCN-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_dec_i8_flat_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0 +; R600-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw udec_wrap ptr %ptr, i8 %value syncscope("agent") seq_cst ret i8 %res } define i8 @test_atomicrmw_dec_i8_flat_agent_align2(ptr %ptr, i8 %value) { -; CHECK-LABEL: @test_atomicrmw_dec_i8_flat_agent_align2( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 -; CHECK-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] -; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]] -; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 -; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 -; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 -; CHECK-NEXT: ret i8 [[EXTRACTED3]] +; GCN-LABEL: @test_atomicrmw_dec_i8_flat_agent_align2( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1 +; GCN-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0 +; GCN-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] +; GCN-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]] +; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; GCN-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED3]] +; +; R600-LABEL: @test_atomicrmw_dec_i8_flat_agent_align2( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1 +; R600-NEXT: [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0 +; R600-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]] +; R600-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]] +; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32 +; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED3]] ; %res = atomicrmw udec_wrap ptr %ptr, i8 %value syncscope("agent") seq_cst, align 2 ret i8 %res diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -1279,24 +1279,23 @@ define half @test_atomicrmw_fadd_f16_local(ptr addrspace(3) %ptr, half %value) { ; CI-LABEL: @test_atomicrmw_fadd_f16_local( -; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; CI-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CI-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CI-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; CI-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; CI-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; CI-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CI-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: ; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half ; CI-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] ; CI-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 ; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; CI-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 @@ -1304,30 +1303,29 @@ ; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; CI: atomicrmw.end: -; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; CI-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; CI-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half ; CI-NEXT: ret half [[TMP7]] ; ; GFX9-LABEL: @test_atomicrmw_fadd_f16_local( -; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GFX9-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX9-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX9-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GFX9-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GFX9-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GFX9-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GFX9-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: ; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half ; GFX9-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 ; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GFX9-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 @@ -1335,30 +1333,29 @@ ; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX9: atomicrmw.end: -; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GFX9-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GFX9-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half ; GFX9-NEXT: ret half [[TMP7]] ; ; GFX908-LABEL: @test_atomicrmw_fadd_f16_local( -; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GFX908-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX908-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX908-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GFX908-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GFX908-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GFX908-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GFX908-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: ; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half ; GFX908-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 ; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 @@ -1366,30 +1363,29 @@ ; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX908: atomicrmw.end: -; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GFX908-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GFX908-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half ; GFX908-NEXT: ret half [[TMP7]] ; ; GFX90A-LABEL: @test_atomicrmw_fadd_f16_local( -; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GFX90A-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX90A-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX90A-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GFX90A-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GFX90A-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GFX90A-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GFX90A-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX90A: atomicrmw.start: ; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half ; GFX90A-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] ; GFX90A-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 ; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 @@ -1397,30 +1393,29 @@ ; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX90A: atomicrmw.end: -; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GFX90A-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GFX90A-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half ; GFX90A-NEXT: ret half [[TMP7]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f16_local( -; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GFX940-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX940-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX940-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GFX940-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GFX940-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GFX940-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GFX940-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX940: atomicrmw.start: ; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half ; GFX940-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] ; GFX940-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 ; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GFX940-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 @@ -1428,30 +1423,29 @@ ; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX940: atomicrmw.end: -; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GFX940-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GFX940-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half ; GFX940-NEXT: ret half [[TMP7]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f16_local( -; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GFX11-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX11-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX11-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GFX11-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GFX11-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GFX11-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GFX11-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: ; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half ; GFX11-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] ; GFX11-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 ; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 @@ -1459,7 +1453,7 @@ ; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX11: atomicrmw.end: -; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GFX11-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GFX11-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half ; GFX11-NEXT: ret half [[TMP7]] @@ -2074,24 +2068,23 @@ define bfloat @test_atomicrmw_fadd_bf16_local(ptr addrspace(3) %ptr, bfloat %value) { ; CI-LABEL: @test_atomicrmw_fadd_bf16_local( -; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; CI-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CI-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CI-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; CI-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; CI-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; CI-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CI-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: ; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat ; CI-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] ; CI-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; CI-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 @@ -2099,30 +2092,29 @@ ; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; CI: atomicrmw.end: -; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; CI-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; CI-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat ; CI-NEXT: ret bfloat [[TMP7]] ; ; GFX9-LABEL: @test_atomicrmw_fadd_bf16_local( -; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GFX9-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX9-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX9-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GFX9-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GFX9-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GFX9-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GFX9-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: ; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat ; GFX9-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GFX9-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 @@ -2130,30 +2122,29 @@ ; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX9: atomicrmw.end: -; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GFX9-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GFX9-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat ; GFX9-NEXT: ret bfloat [[TMP7]] ; ; GFX908-LABEL: @test_atomicrmw_fadd_bf16_local( -; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GFX908-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX908-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX908-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GFX908-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GFX908-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GFX908-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GFX908-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: ; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat ; GFX908-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 @@ -2161,30 +2152,29 @@ ; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX908: atomicrmw.end: -; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GFX908-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GFX908-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat ; GFX908-NEXT: ret bfloat [[TMP7]] ; ; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_local( -; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GFX90A-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX90A-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX90A-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GFX90A-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GFX90A-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GFX90A-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GFX90A-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX90A: atomicrmw.start: ; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat ; GFX90A-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] ; GFX90A-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 @@ -2192,30 +2182,29 @@ ; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX90A: atomicrmw.end: -; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GFX90A-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GFX90A-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat ; GFX90A-NEXT: ret bfloat [[TMP7]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_bf16_local( -; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GFX940-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX940-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX940-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GFX940-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GFX940-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GFX940-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GFX940-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX940: atomicrmw.start: ; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat ; GFX940-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] ; GFX940-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GFX940-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 @@ -2223,30 +2212,29 @@ ; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX940: atomicrmw.end: -; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GFX940-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GFX940-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat ; GFX940-NEXT: ret bfloat [[TMP7]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_bf16_local( -; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GFX11-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX11-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX11-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GFX11-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GFX11-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GFX11-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GFX11-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: ; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat ; GFX11-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] ; GFX11-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 @@ -2254,7 +2242,7 @@ ; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX11: atomicrmw.end: -; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GFX11-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GFX11-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat ; GFX11-NEXT: ret bfloat [[TMP7]] @@ -3040,24 +3028,23 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bfloat %value) #2 { ; CI-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; CI-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CI-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CI-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; CI-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; CI-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; CI-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CI-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: ; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat ; CI-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] ; CI-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; CI-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 @@ -3065,30 +3052,29 @@ ; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; CI: atomicrmw.end: -; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; CI-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; CI-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat ; CI-NEXT: ret bfloat [[TMP7]] ; ; GFX9-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GFX9-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX9-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX9-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GFX9-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GFX9-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GFX9-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GFX9-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: ; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat ; GFX9-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GFX9-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 @@ -3096,30 +3082,29 @@ ; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX9: atomicrmw.end: -; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GFX9-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GFX9-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat ; GFX9-NEXT: ret bfloat [[TMP7]] ; ; GFX908-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GFX908-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX908-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX908-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GFX908-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GFX908-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GFX908-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GFX908-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: ; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat ; GFX908-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 @@ -3127,30 +3112,29 @@ ; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX908: atomicrmw.end: -; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GFX908-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GFX908-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat ; GFX908-NEXT: ret bfloat [[TMP7]] ; ; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GFX90A-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX90A-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX90A-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GFX90A-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GFX90A-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GFX90A-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GFX90A-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX90A: atomicrmw.start: ; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat ; GFX90A-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] ; GFX90A-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 @@ -3158,30 +3142,29 @@ ; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX90A: atomicrmw.end: -; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GFX90A-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GFX90A-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat ; GFX90A-NEXT: ret bfloat [[TMP7]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GFX940-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX940-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX940-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GFX940-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GFX940-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GFX940-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GFX940-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX940: atomicrmw.start: ; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat ; GFX940-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] ; GFX940-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GFX940-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 @@ -3189,30 +3172,29 @@ ; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX940: atomicrmw.end: -; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GFX940-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GFX940-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat ; GFX940-NEXT: ret bfloat [[TMP7]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GFX11-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX11-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX11-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GFX11-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GFX11-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GFX11-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GFX11-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: ; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat ; GFX11-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] ; GFX11-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 @@ -3220,7 +3202,7 @@ ; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX11: atomicrmw.end: -; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GFX11-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GFX11-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat ; GFX11-NEXT: ret bfloat [[TMP7]] diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll @@ -165,24 +165,23 @@ define half @test_atomicrmw_fmax_f16_local(ptr addrspace(3) %ptr, half %value) { ; GCN-LABEL: @test_atomicrmw_fmax_f16_local( -; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GCN-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] ; GCN: atomicrmw.start: ; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half ; GCN-NEXT: [[TMP5:%.*]] = call half @llvm.maxnum.f16(half [[TMP4]], half [[VALUE:%.*]]) ; GCN-NEXT: [[TMP6:%.*]] = bitcast half [[TMP5]] to i16 ; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP6]] to i32 -; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GCN-NEXT: [[TMP7:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 @@ -190,7 +189,7 @@ ; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0 ; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GCN: atomicrmw.end: -; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GCN-NEXT: [[TMP8:%.*]] = bitcast i16 [[EXTRACTED3]] to half ; GCN-NEXT: ret half [[TMP8]] diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll @@ -165,24 +165,23 @@ define half @test_atomicrmw_fmin_f16_local(ptr addrspace(3) %ptr, half %value) { ; GCN-LABEL: @test_atomicrmw_fmin_f16_local( -; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GCN-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] ; GCN: atomicrmw.start: ; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half ; GCN-NEXT: [[TMP5:%.*]] = call half @llvm.minnum.f16(half [[TMP4]], half [[VALUE:%.*]]) ; GCN-NEXT: [[TMP6:%.*]] = bitcast half [[TMP5]] to i16 ; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP6]] to i32 -; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GCN-NEXT: [[TMP7:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 @@ -190,7 +189,7 @@ ; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0 ; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GCN: atomicrmw.end: -; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GCN-NEXT: [[TMP8:%.*]] = bitcast i16 [[EXTRACTED3]] to half ; GCN-NEXT: ret half [[TMP8]] diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll @@ -165,24 +165,23 @@ define half @test_atomicrmw_fsub_f16_local(ptr addrspace(3) %ptr, half %value) { ; GCN-LABEL: @test_atomicrmw_fsub_f16_local( -; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GCN-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] ; GCN: atomicrmw.start: ; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half ; GCN-NEXT: [[NEW:%.*]] = fsub half [[TMP4]], [[VALUE:%.*]] ; GCN-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 ; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GCN-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 @@ -190,7 +189,7 @@ ; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GCN: atomicrmw.end: -; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GCN-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half ; GCN-NEXT: ret half [[TMP7]] @@ -285,24 +284,23 @@ define bfloat @test_atomicrmw_fadd_bf16_local(ptr addrspace(3) %ptr, bfloat %value) { ; GCN-LABEL: @test_atomicrmw_fadd_bf16_local( -; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GCN-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] ; GCN: atomicrmw.start: ; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat ; GCN-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] ; GCN-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GCN-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 @@ -310,7 +308,7 @@ ; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GCN: atomicrmw.end: -; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GCN-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat ; GCN-NEXT: ret bfloat [[TMP7]] @@ -471,24 +469,23 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bfloat %value) #2 { ; GCN-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4) -; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64 -; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 +; GCN-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] ; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 ; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] ; GCN: atomicrmw.start: ; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GCN-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat ; GCN-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] ; GCN-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; GCN-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] ; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] ; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] ; GCN-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 @@ -496,7 +493,7 @@ ; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 ; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GCN: atomicrmw.end: -; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] ; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 ; GCN-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat ; GCN-NEXT: ret bfloat [[TMP7]] diff --git a/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll b/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll --- a/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll +++ b/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll @@ -33,16 +33,12 @@ ; PWR7-NEXT: [[TMP0:%.*]] = alloca i128, align 8 ; PWR7-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP0]]) ; PWR7-NEXT: store i128 [[DESIRE:%.*]], ptr [[TMP0]], align 8 -; PWR7-NEXT: [[TMP1:%.*]] = alloca i128, align 8 -; PWR7-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP1]]) -; PWR7-NEXT: store i128 [[NEW:%.*]], ptr [[TMP1]], align 8 -; PWR7-NEXT: [[TMP2:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 16, ptr [[ADDR:%.*]], ptr [[TMP0]], ptr [[TMP1]], i32 5, i32 5) -; PWR7-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP1]]) -; PWR7-NEXT: [[TMP3:%.*]] = load i128, ptr [[TMP0]], align 8 +; PWR7-NEXT: [[TMP1:%.*]] = call zeroext i1 @__atomic_compare_exchange_16(ptr [[ADDR:%.*]], ptr [[TMP0]], i128 [[NEW:%.*]], i32 5, i32 5) +; PWR7-NEXT: [[TMP2:%.*]] = load i128, ptr [[TMP0]], align 8 ; PWR7-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP0]]) -; PWR7-NEXT: [[TMP4:%.*]] = insertvalue { i128, i1 } poison, i128 [[TMP3]], 0 -; PWR7-NEXT: [[TMP5:%.*]] = insertvalue { i128, i1 } [[TMP4]], i1 [[TMP2]], 1 -; PWR7-NEXT: [[SUCC:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1 +; PWR7-NEXT: [[TMP3:%.*]] = insertvalue { i128, i1 } poison, i128 [[TMP2]], 0 +; PWR7-NEXT: [[TMP4:%.*]] = insertvalue { i128, i1 } [[TMP3]], i1 [[TMP1]], 1 +; PWR7-NEXT: [[SUCC:%.*]] = extractvalue { i128, i1 } [[TMP4]], 1 ; PWR7-NEXT: ret i1 [[SUCC]] ; entry: diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll --- a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll +++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll @@ -4,10 +4,10 @@ define i256 @atomic_load256_libcall(ptr %ptr) nounwind { ; CHECK-LABEL: @atomic_load256_libcall( -; CHECK-NEXT: [[TMP1:%.*]] = alloca i256, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = alloca i256, align 16 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[TMP1]]) -; CHECK-NEXT: call void @__atomic_load(i64 32, ptr [[PTR:%.*]], ptr [[TMP1]], i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = load i256, ptr [[TMP1]], align 8 +; CHECK-NEXT: call void @__atomic_load(i32 32, ptr [[PTR:%.*]], ptr [[TMP1]], i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = load i256, ptr [[TMP1]], align 16 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[TMP1]]) ; CHECK-NEXT: ret i256 [[TMP2]] ; @@ -18,10 +18,10 @@ define i256 @atomic_load256_libcall_as1(ptr addrspace(1) %ptr) nounwind { ; CHECK-LABEL: @atomic_load256_libcall_as1( ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[PTR:%.*]] to ptr -; CHECK-NEXT: [[TMP2:%.*]] = alloca i256, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = alloca i256, align 16 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[TMP2]]) -; CHECK-NEXT: call void @__atomic_load(i64 32, ptr [[TMP1]], ptr [[TMP2]], i32 0) -; CHECK-NEXT: [[TMP3:%.*]] = load i256, ptr [[TMP2]], align 8 +; CHECK-NEXT: call void @__atomic_load(i32 32, ptr [[TMP1]], ptr [[TMP2]], i32 0) +; CHECK-NEXT: [[TMP3:%.*]] = load i256, ptr [[TMP2]], align 16 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[TMP2]]) ; CHECK-NEXT: ret i256 [[TMP3]] ; diff --git a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll --- a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll @@ -7,10 +7,10 @@ ; CHECK-NEXT: [[PTR_CAST:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr ; CHECK-NEXT: br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr addrspace(3) [[PTR]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr addrspace(3) [[PTR]], align 8 ; CHECK-NEXT: ret i64 [[V1]] ; CHECK: l2: -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[PTR_CAST]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[PTR_CAST]], align 8 ; CHECK-NEXT: ret i64 [[V2]] ; %ptr_cast = addrspacecast ptr addrspace(3) %ptr to ptr @@ -31,10 +31,10 @@ ; CHECK-NEXT: [[PTR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr ; CHECK-NEXT: br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr addrspace(5) [[PTR]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr addrspace(5) [[PTR]], align 8 ; CHECK-NEXT: ret i64 [[V1]] ; CHECK: l2: -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[PTR_CAST]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[PTR_CAST]], align 8 ; CHECK-NEXT: ret i64 [[V2]] ; %ptr_cast = addrspacecast ptr addrspace(5) %ptr to ptr @@ -55,11 +55,11 @@ ; CHECK-SAME: i1 [[PRED:%.*]], ptr addrspace(1) [[PTR:%.*]]) { ; CHECK-NEXT: br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 ; CHECK-NEXT: ret i64 [[V1]] ; CHECK: l2: ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[TMP1]], align 8 ; CHECK-NEXT: ret i64 [[V2]] ; %ptr_cast = addrspacecast ptr addrspace(1) %ptr to ptr @@ -79,11 +79,11 @@ ; CHECK-SAME: i1 [[PRED:%.*]], ptr [[PTR:%.*]]) { ; CHECK-NEXT: br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[PTR]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[PTR]], align 8 ; CHECK-NEXT: ret i64 [[V1]] ; CHECK: l2: ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr addrspace(1) [[TMP1]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr addrspace(1) [[TMP1]], align 8 ; CHECK-NEXT: ret i64 [[V2]] ; %ptr_cast = addrspacecast ptr %ptr to ptr addrspace(1) @@ -103,11 +103,11 @@ ; CHECK-SAME: i1 [[PRED:%.*]], ptr [[PTR:%.*]]) { ; CHECK-NEXT: br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[PTR]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[PTR]], align 8 ; CHECK-NEXT: ret i64 [[V1]] ; CHECK: l2: ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(4) -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr addrspace(4) [[TMP1]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr addrspace(4) [[TMP1]], align 8 ; CHECK-NEXT: ret i64 [[V2]] ; %ptr_cast = addrspacecast ptr %ptr to ptr addrspace(4) @@ -125,13 +125,13 @@ define i64 @sink_flat_to_local(i1 %pred, ptr %ptr) { ; CHECK-LABEL: define i64 @sink_flat_to_local( ; CHECK-SAME: i1 [[PRED:%.*]], ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[PTR_CAST:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) ; CHECK-NEXT: br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[PTR]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[PTR]], align 8 ; CHECK-NEXT: ret i64 [[V1]] ; CHECK: l2: -; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr addrspace(3) [[TMP1]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr addrspace(3) [[PTR_CAST]], align 8 ; CHECK-NEXT: ret i64 [[V2]] ; %ptr_cast = addrspacecast ptr %ptr to ptr addrspace(3) @@ -149,13 +149,13 @@ define i64 @sink_flat_to_private(i1 %pred, ptr %ptr) { ; CHECK-LABEL: define i64 @sink_flat_to_private( ; CHECK-SAME: i1 [[PRED:%.*]], ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[PTR_CAST:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) ; CHECK-NEXT: br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[PTR]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[PTR]], align 8 ; CHECK-NEXT: ret i64 [[V1]] ; CHECK: l2: -; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr addrspace(5) [[PTR_CAST]], align 8 ; CHECK-NEXT: ret i64 [[V2]] ; %ptr_cast = addrspacecast ptr %ptr to ptr addrspace(5) diff --git a/llvm/test/Transforms/DivRemPairs/X86/div-expanded-rem-pair.ll b/llvm/test/Transforms/DivRemPairs/X86/div-expanded-rem-pair.ll --- a/llvm/test/Transforms/DivRemPairs/X86/div-expanded-rem-pair.ll +++ b/llvm/test/Transforms/DivRemPairs/X86/div-expanded-rem-pair.ll @@ -214,7 +214,7 @@ ; CHECK-NEXT: [[REM:%.*]] = urem i64 [[A]], [[B]] ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[END:%.*]] ; CHECK: if.then: -; CHECK-NEXT: store i64 [[REM]], ptr [[RP]], align 4 +; CHECK-NEXT: store i64 [[REM]], ptr [[RP]], align 8 ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: ret i64 [[DIV]] @@ -246,7 +246,7 @@ ; CHECK: if.then: ; CHECK-NEXT: [[TMP0:%.*]] = mul i128 [[DIV]], [[B_FROZEN]] ; CHECK-NEXT: [[REM_DECOMPOSED:%.*]] = sub i128 [[A_FROZEN]], [[TMP0]] -; CHECK-NEXT: store i128 [[REM_DECOMPOSED]], ptr [[RP]], align 4 +; CHECK-NEXT: store i128 [[REM_DECOMPOSED]], ptr [[RP]], align 16 ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: ret i128 [[DIV]] @@ -275,7 +275,7 @@ ; CHECK-NEXT: i64 2, label [[SW_BB]] ; CHECK-NEXT: ] ; CHECK: sw.bb: -; CHECK-NEXT: store i64 [[REM]], ptr [[RP:%.*]], align 4 +; CHECK-NEXT: store i64 [[REM]], ptr [[RP:%.*]], align 8 ; CHECK-NEXT: br label [[SW_DEFAULT]] ; CHECK: sw.default: ; CHECK-NEXT: ret i64 [[DIV]] @@ -306,7 +306,7 @@ ; CHECK-NEXT: i64 2, label [[SW_BB]] ; CHECK-NEXT: ] ; CHECK: sw.default: -; CHECK-NEXT: store i64 [[REM]], ptr [[RP:%.*]], align 4 +; CHECK-NEXT: store i64 [[REM]], ptr [[RP:%.*]], align 8 ; CHECK-NEXT: br label [[SW_BB]] ; CHECK: sw.bb: ; CHECK-NEXT: ret i64 [[DIV]] @@ -339,7 +339,7 @@ ; CHECK: if.then: ; CHECK-NEXT: call void @maythrow() ; CHECK-NEXT: [[REM:%.*]] = urem i64 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: store i64 [[REM]], ptr [[RP]], align 4 +; CHECK-NEXT: store i64 [[REM]], ptr [[RP]], align 8 ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: [[DIV:%.*]] = udiv i64 [[A]], [[B]] @@ -369,7 +369,7 @@ ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[END:%.*]] ; CHECK: if.then: ; CHECK-NEXT: [[REM:%.*]] = urem i64 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: store i64 [[REM]], ptr [[RP]], align 4 +; CHECK-NEXT: store i64 [[REM]], ptr [[RP]], align 8 ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: call void @maythrow() @@ -401,7 +401,7 @@ ; CHECK: if.then: ; CHECK-NEXT: call void @maythrow() ; CHECK-NEXT: [[REM:%.*]] = urem i128 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: store i128 [[REM]], ptr [[RP]], align 4 +; CHECK-NEXT: store i128 [[REM]], ptr [[RP]], align 16 ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: [[DIV:%.*]] = udiv i128 [[A]], [[B]] @@ -431,7 +431,7 @@ ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[END:%.*]] ; CHECK: if.then: ; CHECK-NEXT: [[REM:%.*]] = urem i128 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: store i128 [[REM]], ptr [[RP]], align 4 +; CHECK-NEXT: store i128 [[REM]], ptr [[RP]], align 16 ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: call void @maythrow() @@ -464,7 +464,7 @@ ; CHECK-NEXT: ] ; CHECK: sw.bb: ; CHECK-NEXT: [[REM:%.*]] = urem i64 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: store i64 [[REM]], ptr [[RP:%.*]], align 4 +; CHECK-NEXT: store i64 [[REM]], ptr [[RP:%.*]], align 8 ; CHECK-NEXT: br label [[SW_BB1]] ; CHECK: sw.bb1: ; CHECK-NEXT: [[DIV:%.*]] = udiv i64 [[A]], [[B]] @@ -501,7 +501,7 @@ ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END3:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: ; CHECK-NEXT: [[REM:%.*]] = urem i64 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: store i64 [[REM]], ptr [[RP]], align 4 +; CHECK-NEXT: store i64 [[REM]], ptr [[RP]], align 8 ; CHECK-NEXT: [[TOBOOL1_NOT:%.*]] = icmp eq i64 [[C:%.*]], 0 ; CHECK-NEXT: br i1 [[TOBOOL1_NOT]], label [[IF_END3]], label [[RETURN:%.*]] ; CHECK: if.end3: diff --git a/llvm/test/Transforms/IndVarSimplify/ARM/indvar-cost.ll b/llvm/test/Transforms/IndVarSimplify/ARM/indvar-cost.ll --- a/llvm/test/Transforms/IndVarSimplify/ARM/indvar-cost.ll +++ b/llvm/test/Transforms/IndVarSimplify/ARM/indvar-cost.ll @@ -13,6 +13,9 @@ ; CHECK-T1-NEXT: [[CMP41080:%.*]] = icmp eq i32 [[SUB]], 0 ; CHECK-T1-NEXT: br i1 [[CMP41080]], label [[WHILE_END13:%.*]], label [[WHILE_COND5_PREHEADER_PREHEADER:%.*]] ; CHECK-T1: while.cond5.preheader.preheader: +; CHECK-T1-NEXT: [[TMP0:%.*]] = add i32 [[SRCALEN_SRCBLEN]], -2 +; CHECK-T1-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 2) +; CHECK-T1-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[UMIN]], 2 ; CHECK-T1-NEXT: br label [[WHILE_COND5_PREHEADER:%.*]] ; CHECK-T1: while.cond5.preheader: ; CHECK-T1-NEXT: [[COUNT_01084:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_END:%.*]] ], [ 1, [[WHILE_COND5_PREHEADER_PREHEADER]] ] @@ -26,11 +29,11 @@ ; CHECK-T1-NEXT: [[PY_11076:%.*]] = phi ptr [ [[INCDEC_PTR8:%.*]], [[WHILE_BODY7]] ], [ [[PY_01082]], [[WHILE_COND5_PREHEADER]] ] ; CHECK-T1-NEXT: [[PX_11075:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY7]] ], [ [[PSRCB_PSRCA]], [[WHILE_COND5_PREHEADER]] ] ; CHECK-T1-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, ptr [[PX_11075]], i32 1 -; CHECK-T1-NEXT: [[TMP0:%.*]] = load i16, ptr [[PX_11075]], align 2 -; CHECK-T1-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 +; CHECK-T1-NEXT: [[TMP2:%.*]] = load i16, ptr [[PX_11075]], align 2 +; CHECK-T1-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32 ; CHECK-T1-NEXT: [[INCDEC_PTR8]] = getelementptr inbounds i16, ptr [[PY_11076]], i32 -1 -; CHECK-T1-NEXT: [[TMP1:%.*]] = load i16, ptr [[PY_11076]], align 2 -; CHECK-T1-NEXT: [[CONV9:%.*]] = sext i16 [[TMP1]] to i32 +; CHECK-T1-NEXT: [[TMP3:%.*]] = load i16, ptr [[PY_11076]], align 2 +; CHECK-T1-NEXT: [[CONV9:%.*]] = sext i16 [[TMP3]] to i32 ; CHECK-T1-NEXT: [[MUL_I:%.*]] = mul nsw i32 [[CONV9]], [[CONV]] ; CHECK-T1-NEXT: [[SHR3_I:%.*]] = ashr i32 [[CONV]], 16 ; CHECK-T1-NEXT: [[SHR4_I:%.*]] = ashr i32 [[CONV9]], 16 @@ -42,17 +45,15 @@ ; CHECK-T1-NEXT: br i1 [[CMP6]], label [[WHILE_END]], label [[WHILE_BODY7]] ; CHECK-T1: while.end: ; CHECK-T1-NEXT: [[ADD6_I_LCSSA:%.*]] = phi i32 [ [[ADD6_I]], [[WHILE_BODY7]] ] -; CHECK-T1-NEXT: [[TMP2:%.*]] = lshr i32 [[ADD6_I_LCSSA]], 15 -; CHECK-T1-NEXT: [[CONV10:%.*]] = trunc i32 [[TMP2]] to i16 +; CHECK-T1-NEXT: [[TMP4:%.*]] = lshr i32 [[ADD6_I_LCSSA]], 15 +; CHECK-T1-NEXT: [[CONV10:%.*]] = trunc i32 [[TMP4]] to i16 ; CHECK-T1-NEXT: [[INCDEC_PTR11]] = getelementptr inbounds i16, ptr [[POUT_01081]], i32 1 ; CHECK-T1-NEXT: store i16 [[CONV10]], ptr [[POUT_01081]], align 2 ; CHECK-T1-NEXT: [[ADD_PTR]] = getelementptr inbounds i16, ptr [[PSRCA_PSRCB]], i32 [[COUNT_01084]] ; CHECK-T1-NEXT: [[INC]] = add nuw nsw i32 [[COUNT_01084]], 1 ; CHECK-T1-NEXT: [[DEC12]] = add i32 [[BLOCKSIZE1_01083]], -1 -; CHECK-T1-NEXT: [[CMP3:%.*]] = icmp ult i32 [[COUNT_01084]], 3 -; CHECK-T1-NEXT: [[CMP4:%.*]] = icmp ne i32 [[DEC12]], 0 -; CHECK-T1-NEXT: [[TMP3:%.*]] = and i1 [[CMP4]], [[CMP3]] -; CHECK-T1-NEXT: br i1 [[TMP3]], label [[WHILE_COND5_PREHEADER]], label [[WHILE_END13_LOOPEXIT:%.*]] +; CHECK-T1-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], [[TMP1]] +; CHECK-T1-NEXT: br i1 [[EXITCOND]], label [[WHILE_COND5_PREHEADER]], label [[WHILE_END13_LOOPEXIT:%.*]] ; CHECK-T1: while.end13.loopexit: ; CHECK-T1-NEXT: [[INCDEC_PTR11_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR11]], [[WHILE_END]] ] ; CHECK-T1-NEXT: [[ADD_PTR_LCSSA:%.*]] = phi ptr [ [[ADD_PTR]], [[WHILE_END]] ] @@ -85,34 +86,34 @@ ; CHECK-T1-NEXT: [[PY_31056:%.*]] = phi ptr [ [[ADD_PTR_I884:%.*]], [[WHILE_BODY23]] ], [ [[PY_21070]], [[WHILE_BODY23_PREHEADER]] ] ; CHECK-T1-NEXT: [[PX_31055:%.*]] = phi ptr [ [[ADD_PTR_I890:%.*]], [[WHILE_BODY23]] ], [ [[PSRCB_PSRCA]], [[WHILE_BODY23_PREHEADER]] ] ; CHECK-T1-NEXT: [[ARRAYIDX_I907:%.*]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 1 -; CHECK-T1-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I907]], align 2 -; CHECK-T1-NEXT: [[TMP5:%.*]] = load i16, ptr [[PX_31055]], align 2 +; CHECK-T1-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_I907]], align 2 +; CHECK-T1-NEXT: [[TMP6:%.*]] = load i16, ptr [[PX_31055]], align 2 ; CHECK-T1-NEXT: [[ADD_PTR_I912:%.*]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 2 ; CHECK-T1-NEXT: [[ARRAYIDX_I901:%.*]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 1 -; CHECK-T1-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I901]], align 2 -; CHECK-T1-NEXT: [[TMP7:%.*]] = load i16, ptr [[PY_31056]], align 2 +; CHECK-T1-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_I901]], align 2 +; CHECK-T1-NEXT: [[TMP8:%.*]] = load i16, ptr [[PY_31056]], align 2 ; CHECK-T1-NEXT: [[ADD_PTR_I906:%.*]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 -2 -; CHECK-T1-NEXT: [[SHR_I892:%.*]] = sext i16 [[TMP5]] to i32 -; CHECK-T1-NEXT: [[SHR1_I893:%.*]] = sext i16 [[TMP6]] to i32 +; CHECK-T1-NEXT: [[SHR_I892:%.*]] = sext i16 [[TMP6]] to i32 +; CHECK-T1-NEXT: [[SHR1_I893:%.*]] = sext i16 [[TMP7]] to i32 ; CHECK-T1-NEXT: [[MUL_I894:%.*]] = mul nsw i32 [[SHR1_I893]], [[SHR_I892]] -; CHECK-T1-NEXT: [[SHR2_I895:%.*]] = sext i16 [[TMP4]] to i32 -; CHECK-T1-NEXT: [[SHR4_I897:%.*]] = sext i16 [[TMP7]] to i32 +; CHECK-T1-NEXT: [[SHR2_I895:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-T1-NEXT: [[SHR4_I897:%.*]] = sext i16 [[TMP8]] to i32 ; CHECK-T1-NEXT: [[MUL5_I898:%.*]] = mul nsw i32 [[SHR4_I897]], [[SHR2_I895]] ; CHECK-T1-NEXT: [[ADD_I899:%.*]] = add i32 [[MUL_I894]], [[SUM_11057]] ; CHECK-T1-NEXT: [[ADD6_I900:%.*]] = add i32 [[ADD_I899]], [[MUL5_I898]] ; CHECK-T1-NEXT: [[ARRAYIDX_I885:%.*]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 3 -; CHECK-T1-NEXT: [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_I885]], align 2 -; CHECK-T1-NEXT: [[TMP9:%.*]] = load i16, ptr [[ADD_PTR_I912]], align 2 +; CHECK-T1-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I885]], align 2 +; CHECK-T1-NEXT: [[TMP10:%.*]] = load i16, ptr [[ADD_PTR_I912]], align 2 ; CHECK-T1-NEXT: [[ADD_PTR_I890]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 4 ; CHECK-T1-NEXT: [[ARRAYIDX_I879:%.*]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 -1 -; CHECK-T1-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I879]], align 2 -; CHECK-T1-NEXT: [[TMP11:%.*]] = load i16, ptr [[ADD_PTR_I906]], align 2 +; CHECK-T1-NEXT: [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_I879]], align 2 +; CHECK-T1-NEXT: [[TMP12:%.*]] = load i16, ptr [[ADD_PTR_I906]], align 2 ; CHECK-T1-NEXT: [[ADD_PTR_I884]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 -4 -; CHECK-T1-NEXT: [[SHR_I870:%.*]] = sext i16 [[TMP9]] to i32 -; CHECK-T1-NEXT: [[SHR1_I871:%.*]] = sext i16 [[TMP10]] to i32 +; CHECK-T1-NEXT: [[SHR_I870:%.*]] = sext i16 [[TMP10]] to i32 +; CHECK-T1-NEXT: [[SHR1_I871:%.*]] = sext i16 [[TMP11]] to i32 ; CHECK-T1-NEXT: [[MUL_I872:%.*]] = mul nsw i32 [[SHR1_I871]], [[SHR_I870]] -; CHECK-T1-NEXT: [[SHR2_I873:%.*]] = sext i16 [[TMP8]] to i32 -; CHECK-T1-NEXT: [[SHR4_I875:%.*]] = sext i16 [[TMP11]] to i32 +; CHECK-T1-NEXT: [[SHR2_I873:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-T1-NEXT: [[SHR4_I875:%.*]] = sext i16 [[TMP12]] to i32 ; CHECK-T1-NEXT: [[MUL5_I876:%.*]] = mul nsw i32 [[SHR4_I875]], [[SHR2_I873]] ; CHECK-T1-NEXT: [[ADD_I877:%.*]] = add i32 [[ADD6_I900]], [[MUL_I872]] ; CHECK-T1-NEXT: [[ADD6_I878]] = add i32 [[ADD_I877]], [[MUL5_I876]] @@ -140,11 +141,11 @@ ; CHECK-T1-NEXT: [[PY_41064:%.*]] = phi ptr [ [[INCDEC_PTR39:%.*]], [[WHILE_BODY36]] ], [ [[ADD_PTR32]], [[WHILE_BODY36_PREHEADER]] ] ; CHECK-T1-NEXT: [[PX_41063:%.*]] = phi ptr [ [[INCDEC_PTR37:%.*]], [[WHILE_BODY36]] ], [ [[PX_3_LCSSA]], [[WHILE_BODY36_PREHEADER]] ] ; CHECK-T1-NEXT: [[INCDEC_PTR37]] = getelementptr inbounds i16, ptr [[PX_41063]], i32 1 -; CHECK-T1-NEXT: [[TMP12:%.*]] = load i16, ptr [[PX_41063]], align 2 -; CHECK-T1-NEXT: [[CONV38:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-T1-NEXT: [[TMP13:%.*]] = load i16, ptr [[PX_41063]], align 2 +; CHECK-T1-NEXT: [[CONV38:%.*]] = sext i16 [[TMP13]] to i32 ; CHECK-T1-NEXT: [[INCDEC_PTR39]] = getelementptr inbounds i16, ptr [[PY_41064]], i32 -1 -; CHECK-T1-NEXT: [[TMP13:%.*]] = load i16, ptr [[PY_41064]], align 2 -; CHECK-T1-NEXT: [[CONV40:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-T1-NEXT: [[TMP14:%.*]] = load i16, ptr [[PY_41064]], align 2 +; CHECK-T1-NEXT: [[CONV40:%.*]] = sext i16 [[TMP14]] to i32 ; CHECK-T1-NEXT: [[MUL_I863:%.*]] = mul nsw i32 [[CONV40]], [[CONV38]] ; CHECK-T1-NEXT: [[SHR3_I864:%.*]] = ashr i32 [[CONV38]], 16 ; CHECK-T1-NEXT: [[SHR4_I865:%.*]] = ashr i32 [[CONV40]], 16 @@ -159,8 +160,8 @@ ; CHECK-T1-NEXT: br label [[WHILE_END43]] ; CHECK-T1: while.end43: ; CHECK-T1-NEXT: [[SUM_2_LCSSA:%.*]] = phi i32 [ [[SUM_1_LCSSA]], [[WHILE_END31]] ], [ [[ADD6_I868_LCSSA]], [[WHILE_END43_LOOPEXIT]] ] -; CHECK-T1-NEXT: [[TMP14:%.*]] = lshr i32 [[SUM_2_LCSSA]], 15 -; CHECK-T1-NEXT: [[CONV45:%.*]] = trunc i32 [[TMP14]] to i16 +; CHECK-T1-NEXT: [[TMP15:%.*]] = lshr i32 [[SUM_2_LCSSA]], 15 +; CHECK-T1-NEXT: [[CONV45:%.*]] = trunc i32 [[TMP15]] to i16 ; CHECK-T1-NEXT: [[INCDEC_PTR46]] = getelementptr inbounds i16, ptr [[POUT_11069]], i32 1 ; CHECK-T1-NEXT: store i16 [[CONV45]], ptr [[POUT_11069]], align 2 ; CHECK-T1-NEXT: [[SUB47:%.*]] = add i32 [[COUNT_11072]], -1 @@ -184,6 +185,9 @@ ; CHECK-T2-NEXT: [[CMP41080:%.*]] = icmp eq i32 [[SUB]], 0 ; CHECK-T2-NEXT: br i1 [[CMP41080]], label [[WHILE_END13:%.*]], label [[WHILE_COND5_PREHEADER_PREHEADER:%.*]] ; CHECK-T2: while.cond5.preheader.preheader: +; CHECK-T2-NEXT: [[TMP0:%.*]] = add i32 [[SRCALEN_SRCBLEN]], -2 +; CHECK-T2-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 2) +; CHECK-T2-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[UMIN]], 2 ; CHECK-T2-NEXT: br label [[WHILE_COND5_PREHEADER:%.*]] ; CHECK-T2: while.cond5.preheader: ; CHECK-T2-NEXT: [[COUNT_01084:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_END:%.*]] ], [ 1, [[WHILE_COND5_PREHEADER_PREHEADER]] ] @@ -197,11 +201,11 @@ ; CHECK-T2-NEXT: [[PY_11076:%.*]] = phi ptr [ [[INCDEC_PTR8:%.*]], [[WHILE_BODY7]] ], [ [[PY_01082]], [[WHILE_COND5_PREHEADER]] ] ; CHECK-T2-NEXT: [[PX_11075:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY7]] ], [ [[PSRCB_PSRCA]], [[WHILE_COND5_PREHEADER]] ] ; CHECK-T2-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, ptr [[PX_11075]], i32 1 -; CHECK-T2-NEXT: [[TMP0:%.*]] = load i16, ptr [[PX_11075]], align 2 -; CHECK-T2-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 +; CHECK-T2-NEXT: [[TMP2:%.*]] = load i16, ptr [[PX_11075]], align 2 +; CHECK-T2-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32 ; CHECK-T2-NEXT: [[INCDEC_PTR8]] = getelementptr inbounds i16, ptr [[PY_11076]], i32 -1 -; CHECK-T2-NEXT: [[TMP1:%.*]] = load i16, ptr [[PY_11076]], align 2 -; CHECK-T2-NEXT: [[CONV9:%.*]] = sext i16 [[TMP1]] to i32 +; CHECK-T2-NEXT: [[TMP3:%.*]] = load i16, ptr [[PY_11076]], align 2 +; CHECK-T2-NEXT: [[CONV9:%.*]] = sext i16 [[TMP3]] to i32 ; CHECK-T2-NEXT: [[MUL_I:%.*]] = mul nsw i32 [[CONV9]], [[CONV]] ; CHECK-T2-NEXT: [[SHR3_I:%.*]] = ashr i32 [[CONV]], 16 ; CHECK-T2-NEXT: [[SHR4_I:%.*]] = ashr i32 [[CONV9]], 16 @@ -213,17 +217,15 @@ ; CHECK-T2-NEXT: br i1 [[CMP6]], label [[WHILE_END]], label [[WHILE_BODY7]] ; CHECK-T2: while.end: ; CHECK-T2-NEXT: [[ADD6_I_LCSSA:%.*]] = phi i32 [ [[ADD6_I]], [[WHILE_BODY7]] ] -; CHECK-T2-NEXT: [[TMP2:%.*]] = lshr i32 [[ADD6_I_LCSSA]], 15 -; CHECK-T2-NEXT: [[CONV10:%.*]] = trunc i32 [[TMP2]] to i16 +; CHECK-T2-NEXT: [[TMP4:%.*]] = lshr i32 [[ADD6_I_LCSSA]], 15 +; CHECK-T2-NEXT: [[CONV10:%.*]] = trunc i32 [[TMP4]] to i16 ; CHECK-T2-NEXT: [[INCDEC_PTR11]] = getelementptr inbounds i16, ptr [[POUT_01081]], i32 1 ; CHECK-T2-NEXT: store i16 [[CONV10]], ptr [[POUT_01081]], align 2 ; CHECK-T2-NEXT: [[ADD_PTR]] = getelementptr inbounds i16, ptr [[PSRCA_PSRCB]], i32 [[COUNT_01084]] ; CHECK-T2-NEXT: [[INC]] = add nuw nsw i32 [[COUNT_01084]], 1 ; CHECK-T2-NEXT: [[DEC12]] = add i32 [[BLOCKSIZE1_01083]], -1 -; CHECK-T2-NEXT: [[CMP3:%.*]] = icmp ult i32 [[COUNT_01084]], 3 -; CHECK-T2-NEXT: [[CMP4:%.*]] = icmp ne i32 [[DEC12]], 0 -; CHECK-T2-NEXT: [[TMP3:%.*]] = and i1 [[CMP4]], [[CMP3]] -; CHECK-T2-NEXT: br i1 [[TMP3]], label [[WHILE_COND5_PREHEADER]], label [[WHILE_END13_LOOPEXIT:%.*]] +; CHECK-T2-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], [[TMP1]] +; CHECK-T2-NEXT: br i1 [[EXITCOND]], label [[WHILE_COND5_PREHEADER]], label [[WHILE_END13_LOOPEXIT:%.*]] ; CHECK-T2: while.end13.loopexit: ; CHECK-T2-NEXT: [[INCDEC_PTR11_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR11]], [[WHILE_END]] ] ; CHECK-T2-NEXT: [[ADD_PTR_LCSSA:%.*]] = phi ptr [ [[ADD_PTR]], [[WHILE_END]] ] @@ -256,34 +258,34 @@ ; CHECK-T2-NEXT: [[PY_31056:%.*]] = phi ptr [ [[ADD_PTR_I884:%.*]], [[WHILE_BODY23]] ], [ [[PY_21070]], [[WHILE_BODY23_PREHEADER]] ] ; CHECK-T2-NEXT: [[PX_31055:%.*]] = phi ptr [ [[ADD_PTR_I890:%.*]], [[WHILE_BODY23]] ], [ [[PSRCB_PSRCA]], [[WHILE_BODY23_PREHEADER]] ] ; CHECK-T2-NEXT: [[ARRAYIDX_I907:%.*]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 1 -; CHECK-T2-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I907]], align 2 -; CHECK-T2-NEXT: [[TMP5:%.*]] = load i16, ptr [[PX_31055]], align 2 +; CHECK-T2-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_I907]], align 2 +; CHECK-T2-NEXT: [[TMP6:%.*]] = load i16, ptr [[PX_31055]], align 2 ; CHECK-T2-NEXT: [[ADD_PTR_I912:%.*]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 2 ; CHECK-T2-NEXT: [[ARRAYIDX_I901:%.*]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 1 -; CHECK-T2-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I901]], align 2 -; CHECK-T2-NEXT: [[TMP7:%.*]] = load i16, ptr [[PY_31056]], align 2 +; CHECK-T2-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_I901]], align 2 +; CHECK-T2-NEXT: [[TMP8:%.*]] = load i16, ptr [[PY_31056]], align 2 ; CHECK-T2-NEXT: [[ADD_PTR_I906:%.*]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 -2 -; CHECK-T2-NEXT: [[SHR_I892:%.*]] = sext i16 [[TMP5]] to i32 -; CHECK-T2-NEXT: [[SHR1_I893:%.*]] = sext i16 [[TMP6]] to i32 +; CHECK-T2-NEXT: [[SHR_I892:%.*]] = sext i16 [[TMP6]] to i32 +; CHECK-T2-NEXT: [[SHR1_I893:%.*]] = sext i16 [[TMP7]] to i32 ; CHECK-T2-NEXT: [[MUL_I894:%.*]] = mul nsw i32 [[SHR1_I893]], [[SHR_I892]] -; CHECK-T2-NEXT: [[SHR2_I895:%.*]] = sext i16 [[TMP4]] to i32 -; CHECK-T2-NEXT: [[SHR4_I897:%.*]] = sext i16 [[TMP7]] to i32 +; CHECK-T2-NEXT: [[SHR2_I895:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-T2-NEXT: [[SHR4_I897:%.*]] = sext i16 [[TMP8]] to i32 ; CHECK-T2-NEXT: [[MUL5_I898:%.*]] = mul nsw i32 [[SHR4_I897]], [[SHR2_I895]] ; CHECK-T2-NEXT: [[ADD_I899:%.*]] = add i32 [[MUL_I894]], [[SUM_11057]] ; CHECK-T2-NEXT: [[ADD6_I900:%.*]] = add i32 [[ADD_I899]], [[MUL5_I898]] ; CHECK-T2-NEXT: [[ARRAYIDX_I885:%.*]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 3 -; CHECK-T2-NEXT: [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_I885]], align 2 -; CHECK-T2-NEXT: [[TMP9:%.*]] = load i16, ptr [[ADD_PTR_I912]], align 2 +; CHECK-T2-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I885]], align 2 +; CHECK-T2-NEXT: [[TMP10:%.*]] = load i16, ptr [[ADD_PTR_I912]], align 2 ; CHECK-T2-NEXT: [[ADD_PTR_I890]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 4 ; CHECK-T2-NEXT: [[ARRAYIDX_I879:%.*]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 -1 -; CHECK-T2-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I879]], align 2 -; CHECK-T2-NEXT: [[TMP11:%.*]] = load i16, ptr [[ADD_PTR_I906]], align 2 +; CHECK-T2-NEXT: [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_I879]], align 2 +; CHECK-T2-NEXT: [[TMP12:%.*]] = load i16, ptr [[ADD_PTR_I906]], align 2 ; CHECK-T2-NEXT: [[ADD_PTR_I884]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 -4 -; CHECK-T2-NEXT: [[SHR_I870:%.*]] = sext i16 [[TMP9]] to i32 -; CHECK-T2-NEXT: [[SHR1_I871:%.*]] = sext i16 [[TMP10]] to i32 +; CHECK-T2-NEXT: [[SHR_I870:%.*]] = sext i16 [[TMP10]] to i32 +; CHECK-T2-NEXT: [[SHR1_I871:%.*]] = sext i16 [[TMP11]] to i32 ; CHECK-T2-NEXT: [[MUL_I872:%.*]] = mul nsw i32 [[SHR1_I871]], [[SHR_I870]] -; CHECK-T2-NEXT: [[SHR2_I873:%.*]] = sext i16 [[TMP8]] to i32 -; CHECK-T2-NEXT: [[SHR4_I875:%.*]] = sext i16 [[TMP11]] to i32 +; CHECK-T2-NEXT: [[SHR2_I873:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-T2-NEXT: [[SHR4_I875:%.*]] = sext i16 [[TMP12]] to i32 ; CHECK-T2-NEXT: [[MUL5_I876:%.*]] = mul nsw i32 [[SHR4_I875]], [[SHR2_I873]] ; CHECK-T2-NEXT: [[ADD_I877:%.*]] = add i32 [[ADD6_I900]], [[MUL_I872]] ; CHECK-T2-NEXT: [[ADD6_I878]] = add i32 [[ADD_I877]], [[MUL5_I876]] @@ -311,11 +313,11 @@ ; CHECK-T2-NEXT: [[PY_41064:%.*]] = phi ptr [ [[INCDEC_PTR39:%.*]], [[WHILE_BODY36]] ], [ [[ADD_PTR32]], [[WHILE_BODY36_PREHEADER]] ] ; CHECK-T2-NEXT: [[PX_41063:%.*]] = phi ptr [ [[INCDEC_PTR37:%.*]], [[WHILE_BODY36]] ], [ [[PX_3_LCSSA]], [[WHILE_BODY36_PREHEADER]] ] ; CHECK-T2-NEXT: [[INCDEC_PTR37]] = getelementptr inbounds i16, ptr [[PX_41063]], i32 1 -; CHECK-T2-NEXT: [[TMP12:%.*]] = load i16, ptr [[PX_41063]], align 2 -; CHECK-T2-NEXT: [[CONV38:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-T2-NEXT: [[TMP13:%.*]] = load i16, ptr [[PX_41063]], align 2 +; CHECK-T2-NEXT: [[CONV38:%.*]] = sext i16 [[TMP13]] to i32 ; CHECK-T2-NEXT: [[INCDEC_PTR39]] = getelementptr inbounds i16, ptr [[PY_41064]], i32 -1 -; CHECK-T2-NEXT: [[TMP13:%.*]] = load i16, ptr [[PY_41064]], align 2 -; CHECK-T2-NEXT: [[CONV40:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-T2-NEXT: [[TMP14:%.*]] = load i16, ptr [[PY_41064]], align 2 +; CHECK-T2-NEXT: [[CONV40:%.*]] = sext i16 [[TMP14]] to i32 ; CHECK-T2-NEXT: [[MUL_I863:%.*]] = mul nsw i32 [[CONV40]], [[CONV38]] ; CHECK-T2-NEXT: [[SHR3_I864:%.*]] = ashr i32 [[CONV38]], 16 ; CHECK-T2-NEXT: [[SHR4_I865:%.*]] = ashr i32 [[CONV40]], 16 @@ -330,8 +332,8 @@ ; CHECK-T2-NEXT: br label [[WHILE_END43]] ; CHECK-T2: while.end43: ; CHECK-T2-NEXT: [[SUM_2_LCSSA:%.*]] = phi i32 [ [[SUM_1_LCSSA]], [[WHILE_END31]] ], [ [[ADD6_I868_LCSSA]], [[WHILE_END43_LOOPEXIT]] ] -; CHECK-T2-NEXT: [[TMP14:%.*]] = lshr i32 [[SUM_2_LCSSA]], 15 -; CHECK-T2-NEXT: [[CONV45:%.*]] = trunc i32 [[TMP14]] to i16 +; CHECK-T2-NEXT: [[TMP15:%.*]] = lshr i32 [[SUM_2_LCSSA]], 15 +; CHECK-T2-NEXT: [[CONV45:%.*]] = trunc i32 [[TMP15]] to i16 ; CHECK-T2-NEXT: [[INCDEC_PTR46]] = getelementptr inbounds i16, ptr [[POUT_11069]], i32 1 ; CHECK-T2-NEXT: store i16 [[CONV45]], ptr [[POUT_11069]], align 2 ; CHECK-T2-NEXT: [[SUB47:%.*]] = add i32 [[COUNT_11072]], -1 diff --git a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll --- a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll +++ b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll @@ -2,7 +2,7 @@ ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes=inferattrs -S | FileCheck --match-full-lines --check-prefixes=CHECK,CHECK-KNOWN,CHECK-NOLINUX,CHECK-OPEN,CHECK-DARWIN %s ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes=inferattrs -S | FileCheck --match-full-lines --check-prefixes=CHECK,CHECK-KNOWN,CHECK-LINUX %s ; RUN: opt < %s -mtriple=nvptx -passes=inferattrs -S | FileCheck --match-full-lines --check-prefixes=CHECK-NOLINUX,CHECK-NVPTX %s -; RUN: opt < %s -mtriple=powerpc-ibm-aix-xcoff -passes=inferattrs -S | FileCheck --match-full-lines --check-prefixes=CHECK-AIX %s +; RUN: opt < %s -mtriple=powerpc64-ibm-aix-xcoff -passes=inferattrs -S | FileCheck --match-full-lines --check-prefixes=CHECK-NOLINUX,CHECK-AIX %s declare i32 @__nvvm_reflect(ptr) ; CHECK-NVPTX: declare noundef i32 @__nvvm_reflect(ptr noundef) [[NOFREE_NOUNWIND_READNONE:#[0-9]+]] diff --git a/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll b/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll --- a/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll +++ b/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instcombine -S -mtriple "i386-pc-linux" | FileCheck %s -; RUN: opt < %s -passes=instcombine -S -mtriple "i386-pc-win32" | FileCheck %s -; RUN: opt < %s -passes=instcombine -S -mtriple "x86_64-pc-win32" | FileCheck %s -; RUN: opt < %s -passes=instcombine -S -mtriple "i386-pc-mingw32" | FileCheck %s -; RUN: opt < %s -passes=instcombine -S -mtriple "x86_64-pc-mingw32" | FileCheck %s -; RUN: opt < %s -passes=instcombine -S -mtriple "sparc-sun-solaris" | FileCheck %s +; RUN: opt < %s -passes=instcombine -S -mtriple "i386-pc-linux" | FileCheck %s --check-prefixes=CHECK,DOUBLE-4BYTE-ALIGN +; RUN: opt < %s -passes=instcombine -S -mtriple "i386-pc-win32" | FileCheck %s --check-prefixes=CHECK,DOUBLE-8BYTE-ALIGN +; RUN: opt < %s -passes=instcombine -S -mtriple "x86_64-pc-win32" | FileCheck %s --check-prefixes=CHECK,DOUBLE-8BYTE-ALIGN +; RUN: opt < %s -passes=instcombine -S -mtriple "i386-pc-mingw32" | FileCheck %s --check-prefixes=CHECK,DOUBLE-8BYTE-ALIGN +; RUN: opt < %s -passes=instcombine -S -mtriple "x86_64-pc-mingw32" | FileCheck %s --check-prefixes=CHECK,DOUBLE-8BYTE-ALIGN +; RUN: opt < %s -passes=instcombine -S -mtriple "sparc-sun-solaris" | FileCheck %s --check-prefixes=CHECK,DOUBLE-8BYTE-ALIGN ; RUN: opt < %s -passes=instcombine -S -mtriple "x86_64-pc-win32" -enable-debugify 2>&1 | FileCheck --check-prefix=DBG-VALID %s declare double @floor(double) @@ -708,12 +708,19 @@ } define float @test_no_shrink_intrin_floor_multi_use_fpext(half %C) { -; CHECK-LABEL: @test_no_shrink_intrin_floor_multi_use_fpext( -; CHECK-NEXT: [[D:%.*]] = fpext half [[C:%.*]] to double -; CHECK-NEXT: store volatile double [[D]], ptr undef, align 8 -; CHECK-NEXT: [[E:%.*]] = call double @llvm.floor.f64(double [[D]]) -; CHECK-NEXT: [[F:%.*]] = fptrunc double [[E]] to float -; CHECK-NEXT: ret float [[F]] +; DOUBLE-4BYTE-ALIGN-LABEL: @test_no_shrink_intrin_floor_multi_use_fpext( +; DOUBLE-4BYTE-ALIGN-NEXT: [[D:%.*]] = fpext half [[C:%.*]] to double +; DOUBLE-4BYTE-ALIGN-NEXT: store volatile double [[D]], ptr undef, align 4 +; DOUBLE-4BYTE-ALIGN-NEXT: [[E:%.*]] = call double @llvm.floor.f64(double [[D]]) +; DOUBLE-4BYTE-ALIGN-NEXT: [[F:%.*]] = fptrunc double [[E]] to float +; DOUBLE-4BYTE-ALIGN-NEXT: ret float [[F]] +; +; DOUBLE-8BYTE-ALIGN-LABEL: @test_no_shrink_intrin_floor_multi_use_fpext( +; DOUBLE-8BYTE-ALIGN-NEXT: [[D:%.*]] = fpext half [[C:%.*]] to double +; DOUBLE-8BYTE-ALIGN-NEXT: store volatile double [[D]], ptr undef, align 8 +; DOUBLE-8BYTE-ALIGN-NEXT: [[E:%.*]] = call double @llvm.floor.f64(double [[D]]) +; DOUBLE-8BYTE-ALIGN-NEXT: [[F:%.*]] = fptrunc double [[E]] to float +; DOUBLE-8BYTE-ALIGN-NEXT: ret float [[F]] ; %D = fpext half %C to double store volatile double %D, ptr undef @@ -723,12 +730,19 @@ } define float @test_no_shrink_intrin_fabs_multi_use_fpext(half %C) { -; CHECK-LABEL: @test_no_shrink_intrin_fabs_multi_use_fpext( -; CHECK-NEXT: [[D:%.*]] = fpext half [[C:%.*]] to double -; CHECK-NEXT: store volatile double [[D]], ptr undef, align 8 -; CHECK-NEXT: [[E:%.*]] = call double @llvm.fabs.f64(double [[D]]) -; CHECK-NEXT: [[F:%.*]] = fptrunc double [[E]] to float -; CHECK-NEXT: ret float [[F]] +; DOUBLE-4BYTE-ALIGN-LABEL: @test_no_shrink_intrin_fabs_multi_use_fpext( +; DOUBLE-4BYTE-ALIGN-NEXT: [[D:%.*]] = fpext half [[C:%.*]] to double +; DOUBLE-4BYTE-ALIGN-NEXT: store volatile double [[D]], ptr undef, align 4 +; DOUBLE-4BYTE-ALIGN-NEXT: [[E:%.*]] = call double @llvm.fabs.f64(double [[D]]) +; DOUBLE-4BYTE-ALIGN-NEXT: [[F:%.*]] = fptrunc double [[E]] to float +; DOUBLE-4BYTE-ALIGN-NEXT: ret float [[F]] +; +; DOUBLE-8BYTE-ALIGN-LABEL: @test_no_shrink_intrin_fabs_multi_use_fpext( +; DOUBLE-8BYTE-ALIGN-NEXT: [[D:%.*]] = fpext half [[C:%.*]] to double +; DOUBLE-8BYTE-ALIGN-NEXT: store volatile double [[D]], ptr undef, align 8 +; DOUBLE-8BYTE-ALIGN-NEXT: [[E:%.*]] = call double @llvm.fabs.f64(double [[D]]) +; DOUBLE-8BYTE-ALIGN-NEXT: [[F:%.*]] = fptrunc double [[E]] to float +; DOUBLE-8BYTE-ALIGN-NEXT: ret float [[F]] ; %D = fpext half %C to double store volatile double %D, ptr undef diff --git a/llvm/test/Transforms/InstCombine/ffs-i16.ll b/llvm/test/Transforms/InstCombine/ffs-i16.ll --- a/llvm/test/Transforms/InstCombine/ffs-i16.ll +++ b/llvm/test/Transforms/InstCombine/ffs-i16.ll @@ -13,13 +13,13 @@ define void @fold_ffs(i16 %x) { ; AVR-LABEL: @fold_ffs( -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 1) -; AVR-NEXT: [[CTTZ:%.*]] = call i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 true), !range [[RNG0:![0-9]+]] +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 1) +; AVR-NEXT: [[CTTZ:%.*]] = call addrspace(1) i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 true), !range [[RNG0:![0-9]+]] ; AVR-NEXT: [[TMP1:%.*]] = add nuw nsw i16 [[CTTZ]], 1 ; AVR-NEXT: [[DOTNOT:%.*]] = icmp eq i16 [[X]], 0 ; AVR-NEXT: [[NX:%.*]] = select i1 [[DOTNOT]], i16 0, i16 [[TMP1]] -; AVR-NEXT: call void @sink(i16 [[NX]]) +; AVR-NEXT: call addrspace(1) void @sink(i16 [[NX]]) ; AVR-NEXT: ret void ; ; MSP430-LABEL: @fold_ffs( diff --git a/llvm/test/Transforms/InstCombine/fls-i16.ll b/llvm/test/Transforms/InstCombine/fls-i16.ll --- a/llvm/test/Transforms/InstCombine/fls-i16.ll +++ b/llvm/test/Transforms/InstCombine/fls-i16.ll @@ -14,11 +14,11 @@ define void @fold_fls(i16 %x) { ; AVR-LABEL: @fold_fls( -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 1) -; AVR-NEXT: [[CTLZ:%.*]] = call i16 @llvm.ctlz.i16(i16 [[X:%.*]], i1 false), !range [[RNG0:![0-9]+]] +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 1) +; AVR-NEXT: [[CTLZ:%.*]] = call addrspace(1) i16 @llvm.ctlz.i16(i16 [[X:%.*]], i1 false), !range [[RNG0:![0-9]+]] ; AVR-NEXT: [[NX:%.*]] = sub nuw nsw i16 16, [[CTLZ]] -; AVR-NEXT: call void @sink(i16 [[NX]]) +; AVR-NEXT: call addrspace(1) void @sink(i16 [[NX]]) ; AVR-NEXT: ret void ; ; MSP430-LABEL: @fold_fls( diff --git a/llvm/test/Transforms/InstCombine/isascii-i16.ll b/llvm/test/Transforms/InstCombine/isascii-i16.ll --- a/llvm/test/Transforms/InstCombine/isascii-i16.ll +++ b/llvm/test/Transforms/InstCombine/isascii-i16.ll @@ -12,17 +12,17 @@ define void @fold_isascii(i16 %c) { ; AVR-LABEL: @fold_isascii( -; AVR-NEXT: call void @sink(i16 1) -; AVR-NEXT: call void @sink(i16 1) -; AVR-NEXT: call void @sink(i16 1) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 1) +; AVR-NEXT: call addrspace(1) void @sink(i16 1) +; AVR-NEXT: call addrspace(1) void @sink(i16 1) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) ; AVR-NEXT: [[ISASCII:%.*]] = icmp ult i16 [[C:%.*]], 128 ; AVR-NEXT: [[IC:%.*]] = zext i1 [[ISASCII]] to i16 -; AVR-NEXT: call void @sink(i16 [[IC]]) +; AVR-NEXT: call addrspace(1) void @sink(i16 [[IC]]) ; AVR-NEXT: ret void ; ; MSP430-LABEL: @fold_isascii( diff --git a/llvm/test/Transforms/InstCombine/isdigit-i16.ll b/llvm/test/Transforms/InstCombine/isdigit-i16.ll --- a/llvm/test/Transforms/InstCombine/isdigit-i16.ll +++ b/llvm/test/Transforms/InstCombine/isdigit-i16.ll @@ -11,22 +11,22 @@ define void @fold_isdigit(i16 %c) { ; AVR-LABEL: @fold_isdigit( -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 1) -; AVR-NEXT: call void @sink(i16 1) -; AVR-NEXT: call void @sink(i16 1) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) -; AVR-NEXT: call void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 1) +; AVR-NEXT: call addrspace(1) void @sink(i16 1) +; AVR-NEXT: call addrspace(1) void @sink(i16 1) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) +; AVR-NEXT: call addrspace(1) void @sink(i16 0) ; AVR-NEXT: [[ISDIGITTMP:%.*]] = add i16 [[C:%.*]], -48 ; AVR-NEXT: [[ISDIGIT:%.*]] = icmp ult i16 [[ISDIGITTMP]], 10 ; AVR-NEXT: [[IC:%.*]] = zext i1 [[ISDIGIT]] to i16 -; AVR-NEXT: call void @sink(i16 [[IC]]) +; AVR-NEXT: call addrspace(1) void @sink(i16 [[IC]]) ; AVR-NEXT: ret void ; ; MSP430-LABEL: @fold_isdigit( diff --git a/llvm/test/Transforms/InstCombine/pow-4.ll b/llvm/test/Transforms/InstCombine/pow-4.ll --- a/llvm/test/Transforms/InstCombine/pow-4.ll +++ b/llvm/test/Transforms/InstCombine/pow-4.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=instcombine -S < %s -mtriple unknown | FileCheck %s --check-prefixes=CHECK,CHECKI32,CHECKSQRT -; RUN: opt -passes=instcombine -S < %s -mtriple unknown -disable-builtin sqrt | FileCheck %s --check-prefixes=CHECK,CHECKI32,CHECKNOSQRT +; RUN: opt -passes=instcombine -S < %s -mtriple unknown -data-layout=e | FileCheck %s --check-prefixes=CHECK,CHECKI32,CHECKSQRT +; RUN: opt -passes=instcombine -S < %s -mtriple unknown -data-layout=e -disable-builtin sqrt | FileCheck %s --check-prefixes=CHECK,CHECKI32,CHECKNOSQRT ; RUN: opt -passes=instcombine -S < %s -mtriple msp430 | FileCheck %s --check-prefixes=CHECK,CHECKI16,CHECKSQRT ; RUN: opt -passes=instcombine -S < %s -mtriple msp430 -disable-builtin sqrt | FileCheck %s --check-prefixes=CHECK,CHECKI16,CHECKNOSQRT diff --git a/llvm/test/Transforms/InstCombine/pow_fp_int.ll b/llvm/test/Transforms/InstCombine/pow_fp_int.ll --- a/llvm/test/Transforms/InstCombine/pow_fp_int.ll +++ b/llvm/test/Transforms/InstCombine/pow_fp_int.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 ; RUN: opt -mtriple unknown -passes=instcombine -S < %s | FileCheck %s +target datalayout = "e" ; PR42190 ; Can't generate test checks due to PR42740. diff --git a/llvm/test/Transforms/InstCombine/printf-i16.ll b/llvm/test/Transforms/InstCombine/printf-i16.ll --- a/llvm/test/Transforms/InstCombine/printf-i16.ll +++ b/llvm/test/Transforms/InstCombine/printf-i16.ll @@ -24,21 +24,21 @@ define void @xform_printf(i8 %c8, i16 %c16) { ; AVR-LABEL: @xform_printf( -; AVR-NEXT: [[PUTCHAR:%.*]] = call i16 @putchar(i16 1) -; AVR-NEXT: [[PUTCHAR1:%.*]] = call i16 @putchar(i16 1) -; AVR-NEXT: [[PUTCHAR2:%.*]] = call i16 @putchar(i16 1) -; AVR-NEXT: [[PUTCHAR3:%.*]] = call i16 @putchar(i16 127) -; AVR-NEXT: [[PUTCHAR4:%.*]] = call i16 @putchar(i16 127) -; AVR-NEXT: [[PUTCHAR5:%.*]] = call i16 @putchar(i16 127) -; AVR-NEXT: [[PUTCHAR6:%.*]] = call i16 @putchar(i16 128) -; AVR-NEXT: [[PUTCHAR7:%.*]] = call i16 @putchar(i16 128) -; AVR-NEXT: [[PUTCHAR8:%.*]] = call i16 @putchar(i16 128) -; AVR-NEXT: [[PUTCHAR9:%.*]] = call i16 @putchar(i16 255) -; AVR-NEXT: [[PUTCHAR10:%.*]] = call i16 @putchar(i16 255) -; AVR-NEXT: [[PUTCHAR11:%.*]] = call i16 @putchar(i16 255) +; AVR-NEXT: [[PUTCHAR:%.*]] = call addrspace(1) i16 @putchar(i16 1) +; AVR-NEXT: [[PUTCHAR1:%.*]] = call addrspace(1) i16 @putchar(i16 1) +; AVR-NEXT: [[PUTCHAR2:%.*]] = call addrspace(1) i16 @putchar(i16 1) +; AVR-NEXT: [[PUTCHAR3:%.*]] = call addrspace(1) i16 @putchar(i16 127) +; AVR-NEXT: [[PUTCHAR4:%.*]] = call addrspace(1) i16 @putchar(i16 127) +; AVR-NEXT: [[PUTCHAR5:%.*]] = call addrspace(1) i16 @putchar(i16 127) +; AVR-NEXT: [[PUTCHAR6:%.*]] = call addrspace(1) i16 @putchar(i16 128) +; AVR-NEXT: [[PUTCHAR7:%.*]] = call addrspace(1) i16 @putchar(i16 128) +; AVR-NEXT: [[PUTCHAR8:%.*]] = call addrspace(1) i16 @putchar(i16 128) +; AVR-NEXT: [[PUTCHAR9:%.*]] = call addrspace(1) i16 @putchar(i16 255) +; AVR-NEXT: [[PUTCHAR10:%.*]] = call addrspace(1) i16 @putchar(i16 255) +; AVR-NEXT: [[PUTCHAR11:%.*]] = call addrspace(1) i16 @putchar(i16 255) ; AVR-NEXT: [[TMP1:%.*]] = zext i8 [[C8:%.*]] to i16 -; AVR-NEXT: [[PUTCHAR12:%.*]] = call i16 @putchar(i16 [[TMP1]]) -; AVR-NEXT: [[PUTCHAR13:%.*]] = call i16 @putchar(i16 [[C16:%.*]]) +; AVR-NEXT: [[PUTCHAR12:%.*]] = call addrspace(1) i16 @putchar(i16 [[TMP1]]) +; AVR-NEXT: [[PUTCHAR13:%.*]] = call addrspace(1) i16 @putchar(i16 [[C16:%.*]]) ; AVR-NEXT: ret void ; ; MSP430-LABEL: @xform_printf( diff --git a/llvm/test/Transforms/InstCombine/puts-i16.ll b/llvm/test/Transforms/InstCombine/puts-i16.ll --- a/llvm/test/Transforms/InstCombine/puts-i16.ll +++ b/llvm/test/Transforms/InstCombine/puts-i16.ll @@ -14,7 +14,7 @@ define void @xform_puts(i16 %c) { ; Transform puts("") to putchar("\n"). ; AVR-LABEL: @xform_puts( -; AVR-NEXT: [[PUTCHAR:%.*]] = call i16 @putchar(i16 10) +; AVR-NEXT: [[PUTCHAR:%.*]] = call addrspace(1) i16 @putchar(i16 10) ; AVR-NEXT: ret void ; ; MSP430-LABEL: @xform_puts( diff --git a/llvm/test/Transforms/InstCombine/sincospi.ll b/llvm/test/Transforms/InstCombine/sincospi.ll --- a/llvm/test/Transforms/InstCombine/sincospi.ll +++ b/llvm/test/Transforms/InstCombine/sincospi.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes=instcombine -S < %s -mtriple=x86_64-apple-macosx10.9 | FileCheck %s --check-prefix=CHECK-FLOAT-IN-VEC -; RUN: opt -passes=instcombine -S < %s -mtriple=arm-apple-ios7.0 | FileCheck %s -; RUN: opt -passes=instcombine -S < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s -; RUN: opt -passes=instcombine -S < %s -mtriple=x86_64-apple-macosx10.8 | FileCheck %s --check-prefix=CHECK-NO-SINCOS -; RUN: opt -passes=instcombine -S < %s -mtriple=arm-apple-ios6.0 | FileCheck %s --check-prefix=CHECK-NO-SINCOS -; RUN: opt -passes=instcombine -S < %s -mtriple=x86_64-none-linux-gnu | FileCheck %s --check-prefix=CHECK-NO-SINCOS +; RUN: opt -passes=instcombine -S < %s -mtriple=arm-apple-ios7.0 | FileCheck %s --check-prefixes=CHECK,CHECK-DOUBLE-ALIGN4 +; RUN: opt -passes=instcombine -S < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefixes=CHECK,CHECK-DOUBLE-ALIGN8 +; RUN: opt -passes=instcombine -S < %s -mtriple=x86_64-apple-macosx10.8 | FileCheck %s --check-prefixes=CHECK-NO-SINCOS,CHECK-NO-SINCOS-DOUBLE-ALIGN8 +; RUN: opt -passes=instcombine -S < %s -mtriple=arm-apple-ios6.0 | FileCheck %s --check-prefixes=CHECK-NO-SINCOS,CHECK-NO-SINCOS-DOUBLE-ALIGN4 +; RUN: opt -passes=instcombine -S < %s -mtriple=x86_64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK-NO-SINCOS,CHECK-NO-SINCOS-DOUBLE-ALIGN8 attributes #0 = { readnone nounwind } @@ -129,21 +129,37 @@ ; CHECK-FLOAT-IN-VEC-NEXT: [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]] ; CHECK-FLOAT-IN-VEC-NEXT: ret double [[RES]] ; -; CHECK-LABEL: @test_instbased_f64( -; CHECK-NEXT: [[VAL:%.*]] = load double, ptr @var64, align 8 -; CHECK-NEXT: [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double [[VAL]]) -; CHECK-NEXT: [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0 -; CHECK-NEXT: [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1 -; CHECK-NEXT: [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]] -; CHECK-NEXT: [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]] -; CHECK-NEXT: ret double [[RES]] +; CHECK-DOUBLE-ALIGN4-LABEL: @test_instbased_f64( +; CHECK-DOUBLE-ALIGN4-NEXT: [[VAL:%.*]] = load double, ptr @var64, align 4 +; CHECK-DOUBLE-ALIGN4-NEXT: [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double [[VAL]]) +; CHECK-DOUBLE-ALIGN4-NEXT: [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0 +; CHECK-DOUBLE-ALIGN4-NEXT: [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1 +; CHECK-DOUBLE-ALIGN4-NEXT: [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]] +; CHECK-DOUBLE-ALIGN4-NEXT: [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]] +; CHECK-DOUBLE-ALIGN4-NEXT: ret double [[RES]] ; -; CHECK-NO-SINCOS-LABEL: @test_instbased_f64( -; CHECK-NO-SINCOS-NEXT: [[VAL:%.*]] = load double, ptr @var64, align 8 -; CHECK-NO-SINCOS-NEXT: [[SIN:%.*]] = call double @__sinpi(double [[VAL]]) #[[ATTR0]] -; CHECK-NO-SINCOS-NEXT: [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]] -; CHECK-NO-SINCOS-NEXT: [[RES:%.*]] = fadd double [[SIN]], [[COS]] -; CHECK-NO-SINCOS-NEXT: ret double [[RES]] +; CHECK-DOUBLE-ALIGN8-LABEL: @test_instbased_f64( +; CHECK-DOUBLE-ALIGN8-NEXT: [[VAL:%.*]] = load double, ptr @var64, align 8 +; CHECK-DOUBLE-ALIGN8-NEXT: [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double [[VAL]]) +; CHECK-DOUBLE-ALIGN8-NEXT: [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0 +; CHECK-DOUBLE-ALIGN8-NEXT: [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1 +; CHECK-DOUBLE-ALIGN8-NEXT: [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]] +; CHECK-DOUBLE-ALIGN8-NEXT: [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]] +; CHECK-DOUBLE-ALIGN8-NEXT: ret double [[RES]] +; +; CHECK-NO-SINCOS-DOUBLE-ALIGN8-LABEL: @test_instbased_f64( +; CHECK-NO-SINCOS-DOUBLE-ALIGN8-NEXT: [[VAL:%.*]] = load double, ptr @var64, align 8 +; CHECK-NO-SINCOS-DOUBLE-ALIGN8-NEXT: [[SIN:%.*]] = call double @__sinpi(double [[VAL]]) #[[ATTR0]] +; CHECK-NO-SINCOS-DOUBLE-ALIGN8-NEXT: [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]] +; CHECK-NO-SINCOS-DOUBLE-ALIGN8-NEXT: [[RES:%.*]] = fadd double [[SIN]], [[COS]] +; CHECK-NO-SINCOS-DOUBLE-ALIGN8-NEXT: ret double [[RES]] +; +; CHECK-NO-SINCOS-DOUBLE-ALIGN4-LABEL: @test_instbased_f64( +; CHECK-NO-SINCOS-DOUBLE-ALIGN4-NEXT: [[VAL:%.*]] = load double, ptr @var64, align 4 +; CHECK-NO-SINCOS-DOUBLE-ALIGN4-NEXT: [[SIN:%.*]] = call double @__sinpi(double [[VAL]]) #[[ATTR0]] +; CHECK-NO-SINCOS-DOUBLE-ALIGN4-NEXT: [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]] +; CHECK-NO-SINCOS-DOUBLE-ALIGN4-NEXT: [[RES:%.*]] = fadd double [[SIN]], [[COS]] +; CHECK-NO-SINCOS-DOUBLE-ALIGN4-NEXT: ret double [[RES]] ; %val = load double, ptr @var64 %sin = call double @__sinpi(double %val) #0 diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/calls-math-finite.ll b/llvm/test/Transforms/InstSimplify/ConstProp/calls-math-finite.ll --- a/llvm/test/Transforms/InstSimplify/ConstProp/calls-math-finite.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/calls-math-finite.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=instsimplify -S | FileCheck %s -; RUN: opt < %s -passes=instsimplify -S -mtriple=unknown-unknown-linux-musl | FileCheck -check-prefix=MUSL %s +; RUN: opt < %s -passes=instsimplify -S -mtriple=x86_64-unknown-linux-musl | FileCheck -check-prefix=MUSL %s ; Test to verify constant folding can occur when math routines are mapped ; to the ___finite versions of functions due to __FINITE_MATH_ONLY__ ; being enabled on headers on Linux. All calls should constant fold away ; in this test. -target triple = "unknown-unknown-linux-gnu" +target triple = "x86_64-unknown-linux-gnu" declare double @__acos_finite(double) #0 declare float @__acosf_finite(float) #0 diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll --- a/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll @@ -10,21 +10,23 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[WHILE_BODY_LR_PH_I:%.*]] ; CHECK: while.body.lr.ph.i: +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 16 ; CHECK-NEXT: br label [[WHILE_BODY_I:%.*]] ; CHECK: while.body.i: ; CHECK-NEXT: [[INDVARS_IV7_I:%.*]] = phi i64 [ 16, [[WHILE_BODY_LR_PH_I]] ], [ [[INDVARS_IV_NEXT8_I:%.*]], [[COND_TRUE29_I:%.*]] ] ; CHECK-NEXT: [[I_05_I:%.*]] = phi i64 [ 0, [[WHILE_BODY_LR_PH_I]] ], [ [[INDVARS_IV7_I]], [[COND_TRUE29_I]] ] +; CHECK-NEXT: [[LSR4:%.*]] = trunc i64 [[I_05_I]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[LSR4]] to i64 +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[UGLYGEP]], i64 [[TMP0]] ; CHECK-NEXT: [[SEXT_I:%.*]] = shl i64 [[I_05_I]], 32 ; CHECK-NEXT: [[IDX_EXT_I:%.*]] = ashr exact i64 [[SEXT_I]], 32 ; CHECK-NEXT: [[ADD_PTR_SUM_I:%.*]] = add i64 [[IDX_EXT_I]], 16 ; CHECK-NEXT: br label [[FOR_BODY_I:%.*]] ; CHECK: for.body.i: -; CHECK-NEXT: [[INDVARS_IV_I:%.*]] = phi i64 [ 0, [[WHILE_BODY_I]] ], [ [[INDVARS_IV_NEXT_I:%.*]], [[FOR_BODY_I]] ] -; CHECK-NEXT: [[ADD_PTR_SUM:%.*]] = add i64 [[ADD_PTR_SUM_I]], [[INDVARS_IV_I]] -; CHECK-NEXT: [[ARRAYIDX22_I:%.*]] = getelementptr inbounds i8, ptr [[BASE:%.*]], i64 [[ADD_PTR_SUM]] -; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX22_I]], align 1 -; CHECK-NEXT: [[INDVARS_IV_NEXT_I]] = add i64 [[INDVARS_IV_I]], 1 +; CHECK-NEXT: [[LSR_IV2:%.*]] = phi ptr [ [[UGLYGEP3:%.*]], [[FOR_BODY_I]] ], [ [[UGLYGEP1]], [[WHILE_BODY_I]] ] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[LSR_IV2]], align 1 ; CHECK-NEXT: [[CMP:%.*]] = call i1 @check() #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[UGLYGEP3]] = getelementptr i8, ptr [[LSR_IV2]], i64 1 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_END_I:%.*]], label [[FOR_BODY_I]] ; CHECK: for.end.i: ; CHECK-NEXT: [[ADD_PTR_I144:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD_PTR_SUM_I]] @@ -93,18 +95,20 @@ ; CHECK: for.cond468.preheader: ; CHECK-NEXT: br label [[FOR_COND468:%.*]] ; CHECK: for.cond468: -; CHECK-NEXT: [[INDVARS_IV1163:%.*]] = phi i64 [ [[INDVARS_IV_NEXT1164:%.*]], [[IF_THEN477:%.*]] ], [ 1, [[FOR_COND468_PREHEADER]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV1163]] to i32 -; CHECK-NEXT: [[CMP469:%.*]] = icmp slt i32 [[TMP0]], [[N:%.*]] +; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i32 [ 1, [[FOR_COND468_PREHEADER]] ], [ [[LSR_IV_NEXT:%.*]], [[IF_THEN477:%.*]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ getelementptr inbounds ([5000 x %struct.anon.7.91.199.307.415.475.559.643.751.835.943.1003.1111.1219.1351.1375.1399.1435.1471.1483.1519.1531.1651.1771], ptr @tags, i64 0, i64 0, i32 2), [[FOR_COND468_PREHEADER]] ], [ [[UGLYGEP:%.*]], [[IF_THEN477]] ] +; CHECK-NEXT: [[K_0:%.*]] = load i32, ptr [[LSR_IV]], align 4 +; CHECK-NEXT: [[CMP469:%.*]] = icmp slt i32 [[LSR_IV1]], [[N:%.*]] ; CHECK-NEXT: br i1 [[CMP469]], label [[FOR_BODY471:%.*]], label [[FOR_INC498_PREHEADER:%.*]] ; CHECK: for.body471: -; CHECK-NEXT: [[FIRST:%.*]] = getelementptr inbounds [5000 x %struct.anon.7.91.199.307.415.475.559.643.751.835.943.1003.1111.1219.1351.1375.1399.1435.1471.1483.1519.1531.1651.1771], ptr @tags, i64 0, i64 [[INDVARS_IV1163]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[FIRST]], align 4 +; CHECK-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[LSR_IV]], i64 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[UGLYGEP2]], align 4 ; CHECK-NEXT: br i1 false, label [[IF_THEN477]], label [[FOR_INC498_PREHEADER]] ; CHECK: for.inc498.preheader: ; CHECK-NEXT: br label [[FOR_INC498:%.*]] ; CHECK: if.then477: -; CHECK-NEXT: [[INDVARS_IV_NEXT1164]] = add i64 [[INDVARS_IV1163]], 1 +; CHECK-NEXT: [[UGLYGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 12 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add nuw nsw i32 [[LSR_IV1]], 1 ; CHECK-NEXT: br label [[FOR_COND468]] ; CHECK: for.inc498: ; CHECK-NEXT: br label [[FOR_INC498]] @@ -154,27 +158,27 @@ ; CHECK: for.body3.us.i: ; CHECK-NEXT: [[INDVARS_IV_I_SV_PHI:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_I:%.*]], [[MESHBB]] ], [ 0, [[FOR_BODY3_LR_PH_US_I:%.*]] ] ; CHECK-NEXT: [[OPQ_SA_CALC12:%.*]] = sub i32 undef, 227 -; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[INDVARS_IV_I_SV_PHI]], [[INDVARS_IV8_I_SV_PHI26:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 -; CHECK-NEXT: [[MUL_I_US_I:%.*]] = mul nsw i32 0, [[TMP1]] -; CHECK-NEXT: [[ARRAYIDX5_US_I:%.*]] = getelementptr inbounds double, ptr [[U:%.*]], i64 [[INDVARS_IV_I_SV_PHI]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX5_US_I]], align 8 -; CHECK-NEXT: [[INDVARS_IV_NEXT_I]] = add i64 [[INDVARS_IV_I_SV_PHI]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[LSR_IV:%.*]], [[INDVARS_IV_I_SV_PHI]] +; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[TMP0]] to i32 +; CHECK-NEXT: [[MUL_I_US_I:%.*]] = mul nsw i32 0, [[TMP]] +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV_I_SV_PHI]], 3 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[U:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[UGLYGEP]], align 8 ; CHECK-NEXT: br i1 undef, label [[FOR_INC8_US_I:%.*]], label [[MESHBB]] ; CHECK: for.body3.lr.ph.us.i.loopexit: +; CHECK-NEXT: [[LSR_IV_NEXT:%.*]] = add i64 [[LSR_IV]], 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH_US_I]] ; CHECK: for.body3.lr.ph.us.i: -; CHECK-NEXT: [[INDVARS_IV8_I_SV_PHI26]] = phi i64 [ undef, [[MESHBB1]] ], [ [[INDVARS_IV8_I_SV_PHI24:%.*]], [[FOR_BODY3_LR_PH_US_I_LOOPEXIT:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX_US_I:%.*]] = getelementptr inbounds double, ptr undef, i64 [[INDVARS_IV8_I_SV_PHI26]] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDVARS_IV8_I_SV_PHI26]], 1 +; CHECK-NEXT: [[LSR_IV]] = phi i64 [ [[LSR_IV_NEXT]], [[FOR_BODY3_LR_PH_US_I_LOOPEXIT:%.*]] ], [ undef, [[MESHBB1]] ] +; CHECK-NEXT: [[ARRAYIDX_US_I:%.*]] = getelementptr inbounds double, ptr undef, i64 [[LSR_IV]] ; CHECK-NEXT: br label [[FOR_BODY3_US_I:%.*]] ; CHECK: for.inc8.us.i2: ; CHECK-NEXT: unreachable ; CHECK: eval_At_times_u.exit: ; CHECK-NEXT: ret void ; CHECK: meshBB: -; CHECK-NEXT: [[INDVARS_IV8_I_SV_PHI24]] = phi i64 [ undef, [[FOR_BODY3_US_I]] ], [ [[TMP3]], [[FOR_INC8_US_I]] ] ; CHECK-NEXT: [[MESHSTACKVARIABLE_PHI:%.*]] = phi i32 [ [[OPQ_SA_CALC12]], [[FOR_BODY3_US_I]] ], [ undef, [[FOR_INC8_US_I]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT_I]] = add i64 [[INDVARS_IV_I_SV_PHI]], 1 ; CHECK-NEXT: br i1 true, label [[FOR_BODY3_LR_PH_US_I_LOOPEXIT]], label [[FOR_BODY3_US_I]] ; CHECK: meshBB1.loopexit: ; CHECK-NEXT: br label [[MESHBB1]] diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll b/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll --- a/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll +++ b/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll @@ -9,116 +9,110 @@ ; CHECK-A55-NEXT: [[CMP52_NOT:%.*]] = icmp eq i32 [[ARG_0:%.*]], 0 ; CHECK-A55-NEXT: br i1 [[CMP52_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY6_PREHEADER:%.*]] ; CHECK-A55: for.body6.preheader: -; CHECK-A55-NEXT: [[XTRAITER:%.*]] = and i32 [[ARG_0]], 3 +; CHECK-A55-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[ARG_0]] to i64 +; CHECK-A55-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 3 ; CHECK-A55-NEXT: [[TMP0:%.*]] = icmp ult i32 [[ARG_0]], 4 ; CHECK-A55-NEXT: br i1 [[TMP0]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY6_PREHEADER_NEW:%.*]] ; CHECK-A55: for.body6.preheader.new: -; CHECK-A55-NEXT: [[UNROLL_ITER:%.*]] = and i32 [[ARG_0]], -4 +; CHECK-A55-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292 ; CHECK-A55-NEXT: br label [[FOR_BODY6:%.*]] ; CHECK-A55: for.body6: -; CHECK-A55-NEXT: [[K_03:%.*]] = phi i32 [ 0, [[FOR_BODY6_PREHEADER_NEW]] ], [ [[INC_3:%.*]], [[FOR_BODY6]] ] -; CHECK-A55-NEXT: [[NITER:%.*]] = phi i32 [ 0, [[FOR_BODY6_PREHEADER_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[FOR_BODY6]] ] -; CHECK-A55-NEXT: [[IDX_EXT:%.*]] = zext i32 [[K_03]] to i64 -; CHECK-A55-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[ARG_2:%.*]], i64 [[IDX_EXT]] +; CHECK-A55-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY6_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_BODY6]] ] +; CHECK-A55-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[FOR_BODY6_PREHEADER_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[FOR_BODY6]] ] +; CHECK-A55-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[ARG_2:%.*]], i64 [[INDVARS_IV]] ; CHECK-A55-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 ; CHECK-A55-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 -; CHECK-A55-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[ARG_3:%.*]], i64 [[IDX_EXT]] +; CHECK-A55-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[ARG_3:%.*]], i64 [[INDVARS_IV]] ; CHECK-A55-NEXT: [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX14]], align 2 ; CHECK-A55-NEXT: [[CONV15:%.*]] = sext i16 [[TMP2]] to i32 ; CHECK-A55-NEXT: [[MUL16:%.*]] = mul nsw i32 [[CONV15]], [[CONV]] -; CHECK-A55-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[ARG_1:%.*]], i64 [[IDX_EXT]] +; CHECK-A55-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[ARG_1:%.*]], i64 [[INDVARS_IV]] ; CHECK-A55-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 ; CHECK-A55-NEXT: [[ADD21:%.*]] = add nsw i32 [[MUL16]], [[TMP3]] ; CHECK-A55-NEXT: store i32 [[ADD21]], ptr [[ARRAYIDX20]], align 4 -; CHECK-A55-NEXT: [[INC:%.*]] = or i32 [[K_03]], 1 -; CHECK-A55-NEXT: [[IDX_EXT_1:%.*]] = zext i32 [[INC]] to i64 -; CHECK-A55-NEXT: [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[IDX_EXT_1]] +; CHECK-A55-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; CHECK-A55-NEXT: [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[INDVARS_IV_NEXT]] ; CHECK-A55-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX10_1]], align 2 ; CHECK-A55-NEXT: [[CONV_1:%.*]] = sext i16 [[TMP4]] to i32 -; CHECK-A55-NEXT: [[ARRAYIDX14_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[IDX_EXT_1]] +; CHECK-A55-NEXT: [[ARRAYIDX14_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[INDVARS_IV_NEXT]] ; CHECK-A55-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX14_1]], align 2 ; CHECK-A55-NEXT: [[CONV15_1:%.*]] = sext i16 [[TMP5]] to i32 ; CHECK-A55-NEXT: [[MUL16_1:%.*]] = mul nsw i32 [[CONV15_1]], [[CONV_1]] -; CHECK-A55-NEXT: [[ARRAYIDX20_1:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[IDX_EXT_1]] +; CHECK-A55-NEXT: [[ARRAYIDX20_1:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[INDVARS_IV_NEXT]] ; CHECK-A55-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX20_1]], align 4 ; CHECK-A55-NEXT: [[ADD21_1:%.*]] = add nsw i32 [[MUL16_1]], [[TMP6]] ; CHECK-A55-NEXT: store i32 [[ADD21_1]], ptr [[ARRAYIDX20_1]], align 4 -; CHECK-A55-NEXT: [[INC_1:%.*]] = or i32 [[K_03]], 2 -; CHECK-A55-NEXT: [[IDX_EXT_2:%.*]] = zext i32 [[INC_1]] to i64 -; CHECK-A55-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[IDX_EXT_2]] +; CHECK-A55-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2 +; CHECK-A55-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[INDVARS_IV_NEXT_1]] ; CHECK-A55-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX10_2]], align 2 ; CHECK-A55-NEXT: [[CONV_2:%.*]] = sext i16 [[TMP7]] to i32 -; CHECK-A55-NEXT: [[ARRAYIDX14_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[IDX_EXT_2]] +; CHECK-A55-NEXT: [[ARRAYIDX14_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[INDVARS_IV_NEXT_1]] ; CHECK-A55-NEXT: [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX14_2]], align 2 ; CHECK-A55-NEXT: [[CONV15_2:%.*]] = sext i16 [[TMP8]] to i32 ; CHECK-A55-NEXT: [[MUL16_2:%.*]] = mul nsw i32 [[CONV15_2]], [[CONV_2]] -; CHECK-A55-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[IDX_EXT_2]] +; CHECK-A55-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[INDVARS_IV_NEXT_1]] ; CHECK-A55-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX20_2]], align 4 ; CHECK-A55-NEXT: [[ADD21_2:%.*]] = add nsw i32 [[MUL16_2]], [[TMP9]] ; CHECK-A55-NEXT: store i32 [[ADD21_2]], ptr [[ARRAYIDX20_2]], align 4 -; CHECK-A55-NEXT: [[INC_2:%.*]] = or i32 [[K_03]], 3 -; CHECK-A55-NEXT: [[IDX_EXT_3:%.*]] = zext i32 [[INC_2]] to i64 -; CHECK-A55-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[IDX_EXT_3]] +; CHECK-A55-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3 +; CHECK-A55-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[INDVARS_IV_NEXT_2]] ; CHECK-A55-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10_3]], align 2 ; CHECK-A55-NEXT: [[CONV_3:%.*]] = sext i16 [[TMP10]] to i32 -; CHECK-A55-NEXT: [[ARRAYIDX14_3:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[IDX_EXT_3]] +; CHECK-A55-NEXT: [[ARRAYIDX14_3:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[INDVARS_IV_NEXT_2]] ; CHECK-A55-NEXT: [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX14_3]], align 2 ; CHECK-A55-NEXT: [[CONV15_3:%.*]] = sext i16 [[TMP11]] to i32 ; CHECK-A55-NEXT: [[MUL16_3:%.*]] = mul nsw i32 [[CONV15_3]], [[CONV_3]] -; CHECK-A55-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[IDX_EXT_3]] +; CHECK-A55-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[INDVARS_IV_NEXT_2]] ; CHECK-A55-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX20_3]], align 4 ; CHECK-A55-NEXT: [[ADD21_3:%.*]] = add nsw i32 [[MUL16_3]], [[TMP12]] ; CHECK-A55-NEXT: store i32 [[ADD21_3]], ptr [[ARRAYIDX20_3]], align 4 -; CHECK-A55-NEXT: [[INC_3]] = add nuw i32 [[K_03]], 4 -; CHECK-A55-NEXT: [[NITER_NEXT_3]] = add i32 [[NITER]], 4 -; CHECK-A55-NEXT: [[NITER_NCMP_3_NOT:%.*]] = icmp eq i32 [[NITER_NEXT_3]], [[UNROLL_ITER]] -; CHECK-A55-NEXT: br i1 [[NITER_NCMP_3_NOT]], label [[FOR_END_LOOPEXIT_UNR_LCSSA]], label [[FOR_BODY6]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-A55-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4 +; CHECK-A55-NEXT: [[NITER_NEXT_3]] = add i64 [[NITER]], 4 +; CHECK-A55-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]] +; CHECK-A55-NEXT: br i1 [[NITER_NCMP_3]], label [[FOR_END_LOOPEXIT_UNR_LCSSA]], label [[FOR_BODY6]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-A55: for.end.loopexit.unr-lcssa: -; CHECK-A55-NEXT: [[K_03_UNR:%.*]] = phi i32 [ 0, [[FOR_BODY6_PREHEADER]] ], [ [[INC_3]], [[FOR_BODY6]] ] -; CHECK-A55-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i32 [[XTRAITER]], 0 +; CHECK-A55-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[FOR_BODY6_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3]], [[FOR_BODY6]] ] +; CHECK-A55-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 ; CHECK-A55-NEXT: br i1 [[LCMP_MOD_NOT]], label [[FOR_END]], label [[FOR_BODY6_EPIL:%.*]] ; CHECK-A55: for.body6.epil: -; CHECK-A55-NEXT: [[IDX_EXT_EPIL:%.*]] = zext i32 [[K_03_UNR]] to i64 -; CHECK-A55-NEXT: [[ARRAYIDX10_EPIL:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[IDX_EXT_EPIL]] +; CHECK-A55-NEXT: [[ARRAYIDX10_EPIL:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[INDVARS_IV_UNR]] ; CHECK-A55-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX10_EPIL]], align 2 ; CHECK-A55-NEXT: [[CONV_EPIL:%.*]] = sext i16 [[TMP13]] to i32 -; CHECK-A55-NEXT: [[ARRAYIDX14_EPIL:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[IDX_EXT_EPIL]] +; CHECK-A55-NEXT: [[ARRAYIDX14_EPIL:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[INDVARS_IV_UNR]] ; CHECK-A55-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX14_EPIL]], align 2 ; CHECK-A55-NEXT: [[CONV15_EPIL:%.*]] = sext i16 [[TMP14]] to i32 ; CHECK-A55-NEXT: [[MUL16_EPIL:%.*]] = mul nsw i32 [[CONV15_EPIL]], [[CONV_EPIL]] -; CHECK-A55-NEXT: [[ARRAYIDX20_EPIL:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[IDX_EXT_EPIL]] +; CHECK-A55-NEXT: [[ARRAYIDX20_EPIL:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[INDVARS_IV_UNR]] ; CHECK-A55-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX20_EPIL]], align 4 ; CHECK-A55-NEXT: [[ADD21_EPIL:%.*]] = add nsw i32 [[MUL16_EPIL]], [[TMP15]] ; CHECK-A55-NEXT: store i32 [[ADD21_EPIL]], ptr [[ARRAYIDX20_EPIL]], align 4 -; CHECK-A55-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i32 [[XTRAITER]], 1 +; CHECK-A55-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 1 ; CHECK-A55-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY6_EPIL_1:%.*]] ; CHECK-A55: for.body6.epil.1: -; CHECK-A55-NEXT: [[INC_EPIL:%.*]] = add nuw i32 [[K_03_UNR]], 1 -; CHECK-A55-NEXT: [[IDX_EXT_EPIL_1:%.*]] = zext i32 [[INC_EPIL]] to i64 -; CHECK-A55-NEXT: [[ARRAYIDX10_EPIL_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[IDX_EXT_EPIL_1]] +; CHECK-A55-NEXT: [[INDVARS_IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[INDVARS_IV_UNR]], 1 +; CHECK-A55-NEXT: [[ARRAYIDX10_EPIL_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[INDVARS_IV_NEXT_EPIL]] ; CHECK-A55-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX10_EPIL_1]], align 2 ; CHECK-A55-NEXT: [[CONV_EPIL_1:%.*]] = sext i16 [[TMP16]] to i32 -; CHECK-A55-NEXT: [[ARRAYIDX14_EPIL_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[IDX_EXT_EPIL_1]] +; CHECK-A55-NEXT: [[ARRAYIDX14_EPIL_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[INDVARS_IV_NEXT_EPIL]] ; CHECK-A55-NEXT: [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX14_EPIL_1]], align 2 ; CHECK-A55-NEXT: [[CONV15_EPIL_1:%.*]] = sext i16 [[TMP17]] to i32 ; CHECK-A55-NEXT: [[MUL16_EPIL_1:%.*]] = mul nsw i32 [[CONV15_EPIL_1]], [[CONV_EPIL_1]] -; CHECK-A55-NEXT: [[ARRAYIDX20_EPIL_1:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[IDX_EXT_EPIL_1]] +; CHECK-A55-NEXT: [[ARRAYIDX20_EPIL_1:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[INDVARS_IV_NEXT_EPIL]] ; CHECK-A55-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX20_EPIL_1]], align 4 ; CHECK-A55-NEXT: [[ADD21_EPIL_1:%.*]] = add nsw i32 [[MUL16_EPIL_1]], [[TMP18]] ; CHECK-A55-NEXT: store i32 [[ADD21_EPIL_1]], ptr [[ARRAYIDX20_EPIL_1]], align 4 -; CHECK-A55-NEXT: [[EPIL_ITER_CMP_1_NOT:%.*]] = icmp eq i32 [[XTRAITER]], 2 +; CHECK-A55-NEXT: [[EPIL_ITER_CMP_1_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 2 ; CHECK-A55-NEXT: br i1 [[EPIL_ITER_CMP_1_NOT]], label [[FOR_END]], label [[FOR_BODY6_EPIL_2:%.*]] ; CHECK-A55: for.body6.epil.2: -; CHECK-A55-NEXT: [[INC_EPIL_1:%.*]] = add nuw i32 [[K_03_UNR]], 2 -; CHECK-A55-NEXT: [[IDX_EXT_EPIL_2:%.*]] = zext i32 [[INC_EPIL_1]] to i64 -; CHECK-A55-NEXT: [[ARRAYIDX10_EPIL_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[IDX_EXT_EPIL_2]] +; CHECK-A55-NEXT: [[INDVARS_IV_NEXT_EPIL_1:%.*]] = add nuw nsw i64 [[INDVARS_IV_UNR]], 2 +; CHECK-A55-NEXT: [[ARRAYIDX10_EPIL_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[INDVARS_IV_NEXT_EPIL_1]] ; CHECK-A55-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX10_EPIL_2]], align 2 ; CHECK-A55-NEXT: [[CONV_EPIL_2:%.*]] = sext i16 [[TMP19]] to i32 -; CHECK-A55-NEXT: [[ARRAYIDX14_EPIL_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[IDX_EXT_EPIL_2]] +; CHECK-A55-NEXT: [[ARRAYIDX14_EPIL_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[INDVARS_IV_NEXT_EPIL_1]] ; CHECK-A55-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX14_EPIL_2]], align 2 ; CHECK-A55-NEXT: [[CONV15_EPIL_2:%.*]] = sext i16 [[TMP20]] to i32 ; CHECK-A55-NEXT: [[MUL16_EPIL_2:%.*]] = mul nsw i32 [[CONV15_EPIL_2]], [[CONV_EPIL_2]] -; CHECK-A55-NEXT: [[ARRAYIDX20_EPIL_2:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[IDX_EXT_EPIL_2]] +; CHECK-A55-NEXT: [[ARRAYIDX20_EPIL_2:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[INDVARS_IV_NEXT_EPIL_1]] ; CHECK-A55-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX20_EPIL_2]], align 4 ; CHECK-A55-NEXT: [[ADD21_EPIL_2:%.*]] = add nsw i32 [[MUL16_EPIL_2]], [[TMP21]] ; CHECK-A55-NEXT: store i32 [[ADD21_EPIL_2]], ptr [[ARRAYIDX20_EPIL_2]], align 4 @@ -129,24 +123,26 @@ ; CHECK-GENERIC-LABEL: @runtime_unroll_generic( ; CHECK-GENERIC-NEXT: entry: ; CHECK-GENERIC-NEXT: [[CMP52_NOT:%.*]] = icmp eq i32 [[ARG_0:%.*]], 0 -; CHECK-GENERIC-NEXT: br i1 [[CMP52_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY6:%.*]] +; CHECK-GENERIC-NEXT: br i1 [[CMP52_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY6_PREHEADER:%.*]] +; CHECK-GENERIC: for.body6.preheader: +; CHECK-GENERIC-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[ARG_0]] to i64 +; CHECK-GENERIC-NEXT: br label [[FOR_BODY6:%.*]] ; CHECK-GENERIC: for.body6: -; CHECK-GENERIC-NEXT: [[K_03:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY6]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-GENERIC-NEXT: [[IDX_EXT:%.*]] = zext i32 [[K_03]] to i64 -; CHECK-GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[ARG_2:%.*]], i64 [[IDX_EXT]] +; CHECK-GENERIC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY6_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY6]] ] +; CHECK-GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[ARG_2:%.*]], i64 [[INDVARS_IV]] ; CHECK-GENERIC-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 ; CHECK-GENERIC-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 -; CHECK-GENERIC-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[ARG_3:%.*]], i64 [[IDX_EXT]] +; CHECK-GENERIC-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[ARG_3:%.*]], i64 [[INDVARS_IV]] ; CHECK-GENERIC-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX14]], align 2 ; CHECK-GENERIC-NEXT: [[CONV15:%.*]] = sext i16 [[TMP1]] to i32 ; CHECK-GENERIC-NEXT: [[MUL16:%.*]] = mul nsw i32 [[CONV15]], [[CONV]] -; CHECK-GENERIC-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[ARG_1:%.*]], i64 [[IDX_EXT]] +; CHECK-GENERIC-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[ARG_1:%.*]], i64 [[INDVARS_IV]] ; CHECK-GENERIC-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 ; CHECK-GENERIC-NEXT: [[ADD21:%.*]] = add nsw i32 [[MUL16]], [[TMP2]] ; CHECK-GENERIC-NEXT: store i32 [[ADD21]], ptr [[ARRAYIDX20]], align 4 -; CHECK-GENERIC-NEXT: [[INC]] = add nuw i32 [[K_03]], 1 -; CHECK-GENERIC-NEXT: [[CMP5:%.*]] = icmp ult i32 [[INC]], [[ARG_0]] -; CHECK-GENERIC-NEXT: br i1 [[CMP5]], label [[FOR_BODY6]], label [[FOR_END]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-GENERIC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-GENERIC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY6]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-GENERIC: for.end: ; CHECK-GENERIC-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll b/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll --- a/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll +++ b/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll @@ -103,19 +103,19 @@ ; CHECK-V8-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[COUNT_1:%.*]], [[LOOP]] ] ; CHECK-V8-NEXT: [[ADDR_A:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 [[IV]] ; CHECK-V8-NEXT: [[ADDR_B:%.*]] = getelementptr i64, ptr [[B:%.*]], i32 [[IV]] -; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 4 -; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 4 +; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 8 +; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 8 ; CHECK-V8-NEXT: [[RES:%.*]] = add i64 [[DATA_A]], [[DATA_B]] ; CHECK-V8-NEXT: [[ADDR_C:%.*]] = getelementptr i64, ptr [[C:%.*]], i32 [[IV]] -; CHECK-V8-NEXT: store i64 [[RES]], ptr [[ADDR_C]], align 4 +; CHECK-V8-NEXT: store i64 [[RES]], ptr [[ADDR_C]], align 8 ; CHECK-V8-NEXT: [[COUNT:%.*]] = add nuw nsw i32 [[IV]], 1 ; CHECK-V8-NEXT: [[ADDR_A_1:%.*]] = getelementptr i64, ptr [[A]], i32 [[COUNT]] ; CHECK-V8-NEXT: [[ADDR_B_1:%.*]] = getelementptr i64, ptr [[B]], i32 [[COUNT]] -; CHECK-V8-NEXT: [[DATA_A_1:%.*]] = load i64, ptr [[ADDR_A_1]], align 4 -; CHECK-V8-NEXT: [[DATA_B_1:%.*]] = load i64, ptr [[ADDR_B_1]], align 4 +; CHECK-V8-NEXT: [[DATA_A_1:%.*]] = load i64, ptr [[ADDR_A_1]], align 8 +; CHECK-V8-NEXT: [[DATA_B_1:%.*]] = load i64, ptr [[ADDR_B_1]], align 8 ; CHECK-V8-NEXT: [[RES_1:%.*]] = add i64 [[DATA_A_1]], [[DATA_B_1]] ; CHECK-V8-NEXT: [[ADDR_C_1:%.*]] = getelementptr i64, ptr [[C]], i32 [[COUNT]] -; CHECK-V8-NEXT: store i64 [[RES_1]], ptr [[ADDR_C_1]], align 4 +; CHECK-V8-NEXT: store i64 [[RES_1]], ptr [[ADDR_C_1]], align 8 ; CHECK-V8-NEXT: [[COUNT_1]] = add nuw nsw i32 [[IV]], 2 ; CHECK-V8-NEXT: [[END_1:%.*]] = icmp ne i32 [[COUNT_1]], 100 ; CHECK-V8-NEXT: br i1 [[END_1]], label [[LOOP]], label [[EXIT:%.*]] @@ -150,19 +150,19 @@ ; CHECK-V8-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[COUNT_1:%.*]], [[LOOP]] ] ; CHECK-V8-NEXT: [[ADDR_A:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 [[IV]] ; CHECK-V8-NEXT: [[ADDR_B:%.*]] = getelementptr i64, ptr [[B:%.*]], i32 [[IV]] -; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 4 -; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 4 +; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 8 +; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 8 ; CHECK-V8-NEXT: [[RES:%.*]] = add i64 [[DATA_A]], [[DATA_B]] ; CHECK-V8-NEXT: [[ADDR_C:%.*]] = getelementptr i64, ptr [[C:%.*]], i32 [[IV]] -; CHECK-V8-NEXT: store i64 [[RES]], ptr [[ADDR_C]], align 4 +; CHECK-V8-NEXT: store i64 [[RES]], ptr [[ADDR_C]], align 8 ; CHECK-V8-NEXT: [[COUNT:%.*]] = add nuw nsw i32 [[IV]], 1 ; CHECK-V8-NEXT: [[ADDR_A_1:%.*]] = getelementptr i64, ptr [[A]], i32 [[COUNT]] ; CHECK-V8-NEXT: [[ADDR_B_1:%.*]] = getelementptr i64, ptr [[B]], i32 [[COUNT]] -; CHECK-V8-NEXT: [[DATA_A_1:%.*]] = load i64, ptr [[ADDR_A_1]], align 4 -; CHECK-V8-NEXT: [[DATA_B_1:%.*]] = load i64, ptr [[ADDR_B_1]], align 4 +; CHECK-V8-NEXT: [[DATA_A_1:%.*]] = load i64, ptr [[ADDR_A_1]], align 8 +; CHECK-V8-NEXT: [[DATA_B_1:%.*]] = load i64, ptr [[ADDR_B_1]], align 8 ; CHECK-V8-NEXT: [[RES_1:%.*]] = add i64 [[DATA_A_1]], [[DATA_B_1]] ; CHECK-V8-NEXT: [[ADDR_C_1:%.*]] = getelementptr i64, ptr [[C]], i32 [[COUNT]] -; CHECK-V8-NEXT: store i64 [[RES_1]], ptr [[ADDR_C_1]], align 4 +; CHECK-V8-NEXT: store i64 [[RES_1]], ptr [[ADDR_C_1]], align 8 ; CHECK-V8-NEXT: [[COUNT_1]] = add nuw nsw i32 [[IV]], 2 ; CHECK-V8-NEXT: [[END_1:%.*]] = icmp ne i32 [[COUNT_1]], 100 ; CHECK-V8-NEXT: br i1 [[END_1]], label [[LOOP]], label [[EXIT:%.*]] @@ -310,13 +310,13 @@ ; CHECK-V8-NEXT: [[ACC:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ACC_NEXT:%.*]], [[LOOP]] ] ; CHECK-V8-NEXT: [[ADDR_A:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 [[IV]] ; CHECK-V8-NEXT: [[ADDR_B:%.*]] = getelementptr i64, ptr [[B:%.*]], i32 [[IV]] -; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 4 -; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 4 +; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 8 +; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 8 ; CHECK-V8-NEXT: [[UGT:%.*]] = icmp ugt i64 [[DATA_A]], [[DATA_B]] ; CHECK-V8-NEXT: [[UMAX:%.*]] = select i1 [[UGT]], i64 [[DATA_A]], i64 [[DATA_B]] ; CHECK-V8-NEXT: [[ACC_NEXT]] = add i64 [[UMAX]], [[ACC]] ; CHECK-V8-NEXT: [[ADDR_C:%.*]] = getelementptr i64, ptr [[C:%.*]], i32 [[IV]] -; CHECK-V8-NEXT: store i64 [[UMAX]], ptr [[ADDR_C]], align 4 +; CHECK-V8-NEXT: store i64 [[UMAX]], ptr [[ADDR_C]], align 8 ; CHECK-V8-NEXT: [[COUNT]] = add nuw i32 [[IV]], 1 ; CHECK-V8-NEXT: [[END:%.*]] = icmp ne i32 [[COUNT]], 100 ; CHECK-V8-NEXT: br i1 [[END]], label [[LOOP]], label [[EXIT:%.*]] @@ -356,13 +356,13 @@ ; CHECK-V8-NEXT: [[ACC:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ACC_NEXT:%.*]], [[LOOP]] ] ; CHECK-V8-NEXT: [[ADDR_A:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 [[IV]] ; CHECK-V8-NEXT: [[ADDR_B:%.*]] = getelementptr i64, ptr [[B:%.*]], i32 [[IV]] -; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 4 -; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 4 +; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 8 +; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 8 ; CHECK-V8-NEXT: [[UGT:%.*]] = icmp ugt i64 [[DATA_A]], [[DATA_B]] ; CHECK-V8-NEXT: [[UMAX:%.*]] = select i1 [[UGT]], i64 [[DATA_A]], i64 [[DATA_B]] ; CHECK-V8-NEXT: [[ACC_NEXT]] = add i64 [[UMAX]], [[ACC]] ; CHECK-V8-NEXT: [[ADDR_C:%.*]] = getelementptr i64, ptr [[C:%.*]], i32 [[IV]] -; CHECK-V8-NEXT: store i64 [[UMAX]], ptr [[ADDR_C]], align 4 +; CHECK-V8-NEXT: store i64 [[UMAX]], ptr [[ADDR_C]], align 8 ; CHECK-V8-NEXT: [[COUNT]] = add nuw i32 [[IV]], 1 ; CHECK-V8-NEXT: [[END:%.*]] = icmp ne i32 [[COUNT]], 100 ; CHECK-V8-NEXT: br i1 [[END]], label [[LOOP]], label [[EXIT:%.*]] diff --git a/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll b/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll --- a/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll +++ b/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll @@ -17,7 +17,7 @@ ; CHECK-NEXT: store i32 0, ptr [[X]], align 4 ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[REM]], 1 ; CHECK-NEXT: br i1 [[CMP]], label [[WHILE_BODY_1:%.*]], label [[WHILE_END]] ; CHECK: while.body.1: @@ -28,7 +28,7 @@ ; CHECK-NEXT: store i32 0, ptr [[INCDEC_PTR]], align 4 ; CHECK-NEXT: br label [[IF_END_1]] ; CHECK: if.end.1: -; CHECK-NEXT: [[INCDEC_PTR_1:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 2 +; CHECK-NEXT: [[INCDEC_PTR_1:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 2 ; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[REM]], 2 ; CHECK-NEXT: br i1 [[CMP_1]], label [[WHILE_BODY_2:%.*]], label [[WHILE_END]] ; CHECK: while.body.2: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -23,10 +23,10 @@ ; TFNONE: vector.body: ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 4 +; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 ; TFNONE-NEXT: [[TMP5:%.*]] = call @foo_vector( [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 4 +; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 8 ; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 ; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] @@ -40,10 +40,10 @@ ; TFNONE: for.body: ; TFNONE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; TFNONE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; TFNONE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; TFNONE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR3:[0-9]+]] ; TFNONE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFNONE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; TFNONE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025 ; TFNONE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] @@ -66,10 +66,10 @@ ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFCOMMON-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) ; TFCOMMON-NEXT: [[TMP6:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) ; TFCOMMON-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP6]], ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]]) +; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP6]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) ; TFCOMMON-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; TFCOMMON-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 ; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] @@ -359,10 +359,10 @@ ; TFNONE: vector.body: ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 4 +; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 ; TFNONE-NEXT: [[TMP5:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) ; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 4 +; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 8 ; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 ; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] @@ -376,10 +376,10 @@ ; TFNONE: for.body: ; TFNONE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; TFNONE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; TFNONE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; TFNONE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]] ; TFNONE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFNONE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; TFNONE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025 ; TFNONE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] @@ -392,10 +392,10 @@ ; TFALWAYS: for.body: ; TFALWAYS-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; TFALWAYS-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDVARS_IV]] -; TFALWAYS-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; TFALWAYS-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; TFALWAYS-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]] ; TFALWAYS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDVARS_IV]] -; TFALWAYS-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; TFALWAYS-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFALWAYS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; TFALWAYS-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025 ; TFALWAYS-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] @@ -417,10 +417,10 @@ ; TFFALLBACK: vector.body: ; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 4 +; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 ; TFFALLBACK-NEXT: [[TMP5:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) ; TFFALLBACK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: store [[TMP5]], ptr [[TMP6]], align 4 +; TFFALLBACK-NEXT: store [[TMP5]], ptr [[TMP6]], align 8 ; TFFALLBACK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; TFFALLBACK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 ; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] @@ -432,10 +432,10 @@ ; TFFALLBACK: for.body: ; TFFALLBACK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; TFFALLBACK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; TFFALLBACK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; TFFALLBACK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; TFFALLBACK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]] ; TFFALLBACK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; TFFALLBACK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; TFFALLBACK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFFALLBACK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; TFFALLBACK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025 ; TFFALLBACK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -479,10 +479,10 @@ ; TFNONE: vector.body: ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 4 +; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 ; TFNONE-NEXT: [[TMP5:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) ; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 4 +; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 8 ; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 ; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] @@ -496,10 +496,10 @@ ; TFNONE: for.body: ; TFNONE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; TFNONE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; TFNONE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; TFNONE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR6:[0-9]+]] ; TFNONE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFNONE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; TFNONE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025 ; TFNONE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] @@ -522,10 +522,10 @@ ; TFALWAYS-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFALWAYS-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) ; TFALWAYS-NEXT: [[TMP6:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) ; TFALWAYS-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP6]], ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]]) +; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP6]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) ; TFALWAYS-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; TFALWAYS-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 ; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] @@ -552,10 +552,10 @@ ; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) ; TFFALLBACK-NEXT: [[TMP6:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) ; TFFALLBACK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP6]], ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]]) +; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP6]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) ; TFFALLBACK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; TFFALLBACK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 ; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] @@ -611,7 +611,7 @@ ; TFNONE-NEXT: [[TMP6:%.*]] = fptoui [[WIDE_LOAD]] to ; TFNONE-NEXT: [[TMP7:%.*]] = call @foo_vector( [[TMP6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; TFNONE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: store [[TMP7]], ptr [[TMP8]], align 4 +; TFNONE-NEXT: store [[TMP7]], ptr [[TMP8]], align 8 ; TFNONE-NEXT: [[TMP9]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], [[TMP5]]) ; TFNONE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() ; TFNONE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 @@ -633,7 +633,7 @@ ; TFNONE-NEXT: [[TOINT:%.*]] = fptoui double [[LOAD]] to i64 ; TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[TOINT]]) #[[ATTR3]] ; TFNONE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFNONE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; TFNONE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025 ; TFNONE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] @@ -665,7 +665,7 @@ ; TFALWAYS-NEXT: [[TMP7:%.*]] = fptoui [[WIDE_MASKED_LOAD]] to ; TFALWAYS-NEXT: [[TMP8:%.*]] = call @foo_vector( [[TMP7]], [[ACTIVE_LANE_MASK]]) ; TFALWAYS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP8]], ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]]) +; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP8]], ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]]) ; TFALWAYS-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], shufflevector ( insertelement ( poison, double -0.000000e+00, i64 0), poison, zeroinitializer) ; TFALWAYS-NEXT: [[TMP11]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], [[TMP10]]) ; TFALWAYS-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() @@ -702,7 +702,7 @@ ; TFFALLBACK-NEXT: [[TMP7:%.*]] = fptoui [[WIDE_MASKED_LOAD]] to ; TFFALLBACK-NEXT: [[TMP8:%.*]] = call @foo_vector( [[TMP7]], [[ACTIVE_LANE_MASK]]) ; TFFALLBACK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP8]], ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]]) +; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP8]], ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]]) ; TFFALLBACK-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], shufflevector ( insertelement ( poison, double -0.000000e+00, i64 0), poison, zeroinitializer) ; TFFALLBACK-NEXT: [[TMP11]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], [[TMP10]]) ; TFFALLBACK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll @@ -33,11 +33,11 @@ ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 8 ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]] -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP15]], align 4 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP15]], align 8 ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64( [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP17]] = and i64 [[TMP16]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64( [[WIDE_LOAD3]]) @@ -67,7 +67,7 @@ ; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX7]], 0 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP25]], align 4 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP25]], align 8 ; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> [[WIDE_LOAD9]]) ; CHECK-NEXT: [[TMP27]] = and i64 [[TMP26]], [[VEC_PHI8]] ; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2 @@ -84,7 +84,7 @@ ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[AND:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX11]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[L3:%.*]] = load i64, ptr [[L2]], align 4 +; CHECK-NEXT: [[L3:%.*]] = load i64, ptr [[L2]], align 8 ; CHECK-NEXT: [[AND]] = and i64 [[RDX]], [[L3]] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll @@ -33,11 +33,11 @@ ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 8 ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]] -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP15]], align 4 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP15]], align 8 ; CHECK-NEXT: [[TMP16]] = add [[WIDE_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP17]] = add [[WIDE_LOAD3]], [[VEC_PHI2]] ; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() @@ -67,7 +67,7 @@ ; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX7]], 0 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP25]], align 4 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP25]], align 8 ; CHECK-NEXT: [[TMP26]] = add <2 x i64> [[WIDE_LOAD9]], [[VEC_PHI8]] ; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2 ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]] @@ -84,7 +84,7 @@ ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX11]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[ADD]] = add i64 [[TMP29]], [[SUM]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll @@ -55,11 +55,11 @@ ; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 ; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[START_2]], i64 [[TMP28]] ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 -; CHECK-NEXT: store zeroinitializer, ptr [[TMP29]], align 4 +; CHECK-NEXT: store zeroinitializer, ptr [[TMP29]], align 8 ; CHECK-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 2 ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i64 [[TMP31]] -; CHECK-NEXT: store zeroinitializer, ptr [[TMP32]], align 4 +; CHECK-NEXT: store zeroinitializer, ptr [[TMP32]], align 8 ; CHECK-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP34]] @@ -79,7 +79,7 @@ ; CHECK: loop: ; CHECK-NEXT: [[IV_1:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[IV_2:%.*]] = phi ptr [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: store i64 0, ptr [[IV_2]], align 4 +; CHECK-NEXT: store i64 0, ptr [[IV_2]], align 8 ; CHECK-NEXT: [[IV_2_NEXT]] = getelementptr inbounds ptr, ptr [[IV_2]], i64 1 ; CHECK-NEXT: [[IV_1_NEXT]] = getelementptr inbounds ptr, ptr [[IV_1]], i64 1 ; CHECK-NEXT: [[CMP_I_I_NOT_I:%.*]] = icmp eq ptr [[IV_2_NEXT]], [[END]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll @@ -61,17 +61,17 @@ ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i64, ptr [[SRC_2]], i64 [[TMP17]] ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i64, ptr [[SRC_2]], i64 [[TMP22]] ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i64, ptr [[TMP23]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP27]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP27]], align 8 ; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 2 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i64, ptr [[TMP23]], i64 [[TMP29]] -; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load , ptr [[TMP30]], align 4 +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load , ptr [[TMP30]], align 8 ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i64, ptr [[TMP25]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load , ptr [[TMP31]], align 4 +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load , ptr [[TMP31]], align 8 ; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 2 ; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i64, ptr [[TMP25]], i64 [[TMP33]] -; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load , ptr [[TMP34]], align 4 +; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load , ptr [[TMP34]], align 8 ; CHECK-NEXT: [[TMP35:%.*]] = add [[WIDE_LOAD]], [[WIDE_LOAD13]] ; CHECK-NEXT: [[TMP36:%.*]] = add [[WIDE_LOAD12]], [[WIDE_LOAD14]] ; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i64, ptr [[DST_1]], i64 [[TMP17]] @@ -79,17 +79,17 @@ ; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i64, ptr [[DST_2]], i64 [[TMP17]] ; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i64, ptr [[DST_2]], i64 [[TMP22]] ; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i64, ptr [[TMP37]], i32 0 -; CHECK-NEXT: store [[TMP35]], ptr [[TMP41]], align 4 +; CHECK-NEXT: store [[TMP35]], ptr [[TMP41]], align 8 ; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 2 ; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i64, ptr [[TMP37]], i64 [[TMP43]] -; CHECK-NEXT: store [[TMP36]], ptr [[TMP44]], align 4 +; CHECK-NEXT: store [[TMP36]], ptr [[TMP44]], align 8 ; CHECK-NEXT: [[TMP45:%.*]] = getelementptr i64, ptr [[TMP39]], i32 0 -; CHECK-NEXT: store [[TMP35]], ptr [[TMP45]], align 4 +; CHECK-NEXT: store [[TMP35]], ptr [[TMP45]], align 8 ; CHECK-NEXT: [[TMP46:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 2 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i64, ptr [[TMP39]], i64 [[TMP47]] -; CHECK-NEXT: store [[TMP36]], ptr [[TMP48]], align 4 +; CHECK-NEXT: store [[TMP36]], ptr [[TMP48]], align 8 ; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP49]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP50]] @@ -105,13 +105,13 @@ ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i64, ptr [[SRC_1]], i64 [[IV]] ; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i64, ptr [[SRC_2]], i64 [[IV]] -; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 4 -; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[GEP_SRC_2]], align 4 +; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 8 +; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[GEP_SRC_2]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = add i64 [[L_1]], [[L_2]] ; CHECK-NEXT: [[GEP_DST_1:%.*]] = getelementptr i64, ptr [[DST_1]], i64 [[IV]] ; CHECK-NEXT: [[GEP_DST_2:%.*]] = getelementptr i64, ptr [[DST_2]], i64 [[IV]] -; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP_DST_1]], align 4 -; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP_DST_2]], align 4 +; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP_DST_1]], align 8 +; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP_DST_2]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 ; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[CMP10]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll @@ -185,11 +185,11 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @foo_vector_fixed4_nomask(<4 x i64> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 4 +; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -201,10 +201,10 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR1:[0-9]+]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] @@ -241,11 +241,11 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @foo_vector_fixed4_mask(<4 x i64> [[WIDE_LOAD]], <4 x i1> ) ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 4 +; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -257,10 +257,10 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2:[0-9]+]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -297,11 +297,11 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @foo_vector_fixed4_nomask(<4 x i64> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 4 +; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -313,10 +313,10 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll --- a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll @@ -8,21 +8,20 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[INDEX]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[ARR:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double> -; CHECK-NEXT: [[TMP3:%.*]] = tail call fast <2 x double> @__sind2(<2 x double> [[TMP2]]) -; CHECK-NEXT: [[TMP4]] = fadd fast <2 x double> [[TMP3]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[ARR:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double> +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast <2 x double> @__sind2(<2 x double> [[TMP1]]) +; CHECK-NEXT: [[TMP3]] = fadd fast <2 x double> [[TMP2]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <2 x double> [ [[TMP4]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[DOTLCSSA]]) -; CHECK-NEXT: ret double [[TMP6]] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <2 x double> [ [[TMP3]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[DOTLCSSA]]) +; CHECK-NEXT: ret double [[TMP5]] ; entry: br label %for.cond diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses-zve32x.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses-zve32x.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses-zve32x.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses-zve32x.ll @@ -10,14 +10,14 @@ ; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 ; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 4 +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 4 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 ; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] -; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 4 +; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 -; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 4 +; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 ; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -77,22 +77,49 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-LABEL: @load_store_factor2_i64( ; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP6]], i32 -1 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> +; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 4 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 4 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 ; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] -; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 4 +; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 -; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 4 +; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -150,7 +177,7 @@ ; CHECK-NEXT: store <6 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -175,7 +202,7 @@ ; CHECK-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -212,27 +239,75 @@ define void @load_store_factor3_i64(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i64( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = mul [[VEC_IND]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[P:%.*]], [[TMP10]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP11]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP12:%.*]] = add [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP12]], [[TMP11]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P]], [[TMP13]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP14]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP15:%.*]] = add [[WIDE_MASKED_GATHER1]], shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP15]], [[TMP14]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP13]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i64, ptr [[P]], [[TMP16]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP17]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP18:%.*]] = add [[WIDE_MASKED_GATHER2]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP18]], [[TMP17]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 4 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 4 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 ; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] -; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 4 +; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 -; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 4 +; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 ; CHECK-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1 ; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] -; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 4 +; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 -; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 4 +; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -269,52 +344,125 @@ define void @load_store_factor8(ptr %p) { ; CHECK-LABEL: @load_store_factor8( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = shl [[VEC_IND]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[P:%.*]], [[TMP10]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP11]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP12:%.*]] = add [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP12]], [[TMP11]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P]], [[TMP13]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP14]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP15:%.*]] = add [[WIDE_MASKED_GATHER1]], shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP15]], [[TMP14]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP13]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i64, ptr [[P]], [[TMP16]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP17]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP18:%.*]] = add [[WIDE_MASKED_GATHER2]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP18]], [[TMP17]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP19:%.*]] = add [[TMP16]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[P]], [[TMP19]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP20]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP21:%.*]] = add [[WIDE_MASKED_GATHER3]], shufflevector ( insertelement ( poison, i64 4, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP21]], [[TMP20]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP19]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i64, ptr [[P]], [[TMP22]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP23]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP24:%.*]] = add [[WIDE_MASKED_GATHER4]], shufflevector ( insertelement ( poison, i64 5, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP24]], [[TMP23]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP25:%.*]] = add [[TMP22]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i64, ptr [[P]], [[TMP25]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP26]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP27:%.*]] = add [[WIDE_MASKED_GATHER5]], shufflevector ( insertelement ( poison, i64 6, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP27]], [[TMP26]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP28:%.*]] = add [[TMP25]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i64, ptr [[P]], [[TMP28]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP29]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP30:%.*]] = add [[WIDE_MASKED_GATHER6]], shufflevector ( insertelement ( poison, i64 7, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP30]], [[TMP29]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP31:%.*]] = add [[TMP28]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i64, ptr [[P]], [[TMP31]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP32]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP33:%.*]] = add [[WIDE_MASKED_GATHER7]], shufflevector ( insertelement ( poison, i64 8, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP33]], [[TMP32]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP35]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 4 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 4 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 ; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] -; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 4 +; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 -; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 4 +; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 ; CHECK-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1 ; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] -; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 4 +; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 -; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 4 +; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 ; CHECK-NEXT: [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] -; CHECK-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 4 +; CHECK-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 8 ; CHECK-NEXT: [[Y3:%.*]] = add i64 [[X3]], 4 -; CHECK-NEXT: store i64 [[Y3]], ptr [[Q3]], align 4 +; CHECK-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8 ; CHECK-NEXT: [[OFFSET4:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q4:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET4]] -; CHECK-NEXT: [[X4:%.*]] = load i64, ptr [[Q4]], align 4 +; CHECK-NEXT: [[X4:%.*]] = load i64, ptr [[Q4]], align 8 ; CHECK-NEXT: [[Y4:%.*]] = add i64 [[X4]], 5 -; CHECK-NEXT: store i64 [[Y4]], ptr [[Q4]], align 4 +; CHECK-NEXT: store i64 [[Y4]], ptr [[Q4]], align 8 ; CHECK-NEXT: [[OFFSET5:%.*]] = add i64 [[OFFSET4]], 1 ; CHECK-NEXT: [[Q5:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET5]] -; CHECK-NEXT: [[X5:%.*]] = load i64, ptr [[Q5]], align 4 +; CHECK-NEXT: [[X5:%.*]] = load i64, ptr [[Q5]], align 8 ; CHECK-NEXT: [[Y5:%.*]] = add i64 [[X5]], 6 -; CHECK-NEXT: store i64 [[Y5]], ptr [[Q5]], align 4 +; CHECK-NEXT: store i64 [[Y5]], ptr [[Q5]], align 8 ; CHECK-NEXT: [[OFFSET6:%.*]] = add i64 [[OFFSET5]], 1 ; CHECK-NEXT: [[Q6:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET6]] -; CHECK-NEXT: [[X6:%.*]] = load i64, ptr [[Q6]], align 4 +; CHECK-NEXT: [[X6:%.*]] = load i64, ptr [[Q6]], align 8 ; CHECK-NEXT: [[Y6:%.*]] = add i64 [[X6]], 7 -; CHECK-NEXT: store i64 [[Y6]], ptr [[Q6]], align 4 +; CHECK-NEXT: store i64 [[Y6]], ptr [[Q6]], align 8 ; CHECK-NEXT: [[OFFSET7:%.*]] = add i64 [[OFFSET6]], 1 ; CHECK-NEXT: [[Q7:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET7]] -; CHECK-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 4 +; CHECK-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; CHECK-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 -; CHECK-NEXT: store i64 [[Y7]], ptr [[Q7]], align 4 +; CHECK-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -410,7 +558,7 @@ ; CHECK-NEXT: store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -429,7 +577,7 @@ ; CHECK-NEXT: store i32 [[RES]], ptr [[DST]], align 4 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -461,21 +609,55 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-LABEL: @combine_load_factor2_i64( ; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP6]], align 8 +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]] +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[Q]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0 +; CHECK-NEXT: store <4 x i64> [[TMP8]], ptr [[TMP12]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4 +; CHECK-NEXT: store <4 x i64> [[TMP9]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 4 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 ; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] -; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 4 +; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[I]] -; CHECK-NEXT: store i64 [[RES]], ptr [[DST]], align 4 +; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: store i64 [[RES]], ptr [[DST]], align 8 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll @@ -21,9 +21,9 @@ ; LMUL1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 ; LMUL1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP2]] ; LMUL1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 4 +; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 ; LMUL1-NEXT: [[TMP5:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; LMUL1-NEXT: store [[TMP5]], ptr [[TMP4]], align 4 +; LMUL1-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 ; LMUL1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; LMUL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; LMUL1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -37,9 +37,9 @@ ; LMUL1: for.body: ; LMUL1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; LMUL1-NEXT: [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]] -; LMUL1-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 4 +; LMUL1-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 8 ; LMUL1-NEXT: [[W:%.*]] = add i64 [[V]], 1 -; LMUL1-NEXT: store i64 [[W]], ptr [[Q]], align 4 +; LMUL1-NEXT: store i64 [[W]], ptr [[Q]], align 8 ; LMUL1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; LMUL1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; LMUL1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] @@ -63,9 +63,9 @@ ; LMUL2-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; LMUL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]] ; LMUL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 -; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 ; LMUL2-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; LMUL2-NEXT: store [[TMP7]], ptr [[TMP6]], align 4 +; LMUL2-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 ; LMUL2-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; LMUL2-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 ; LMUL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] @@ -80,9 +80,9 @@ ; LMUL2: for.body: ; LMUL2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; LMUL2-NEXT: [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]] -; LMUL2-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 4 +; LMUL2-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 8 ; LMUL2-NEXT: [[W:%.*]] = add i64 [[V]], 1 -; LMUL2-NEXT: store i64 [[W]], ptr [[Q]], align 4 +; LMUL2-NEXT: store i64 [[W]], ptr [[Q]], align 8 ; LMUL2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; LMUL2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; LMUL2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] @@ -106,9 +106,9 @@ ; LMUL4-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; LMUL4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]] ; LMUL4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 -; LMUL4-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; LMUL4-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 ; LMUL4-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; LMUL4-NEXT: store [[TMP7]], ptr [[TMP6]], align 4 +; LMUL4-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 ; LMUL4-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; LMUL4-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 ; LMUL4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] @@ -123,9 +123,9 @@ ; LMUL4: for.body: ; LMUL4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; LMUL4-NEXT: [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]] -; LMUL4-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 4 +; LMUL4-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 8 ; LMUL4-NEXT: [[W:%.*]] = add i64 [[V]], 1 -; LMUL4-NEXT: store i64 [[W]], ptr [[Q]], align 4 +; LMUL4-NEXT: store i64 [[W]], ptr [[Q]], align 8 ; LMUL4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; LMUL4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; LMUL4-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] @@ -149,9 +149,9 @@ ; LMUL8-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; LMUL8-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]] ; LMUL8-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 -; LMUL8-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; LMUL8-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 ; LMUL8-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; LMUL8-NEXT: store [[TMP7]], ptr [[TMP6]], align 4 +; LMUL8-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 ; LMUL8-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; LMUL8-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8 ; LMUL8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] @@ -166,9 +166,9 @@ ; LMUL8: for.body: ; LMUL8-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; LMUL8-NEXT: [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]] -; LMUL8-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 4 +; LMUL8-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 8 ; LMUL8-NEXT: [[W:%.*]] = add i64 [[V]], 1 -; LMUL8-NEXT: store i64 [[W]], ptr [[Q]], align 4 +; LMUL8-NEXT: store i64 [[W]], ptr [[Q]], align 8 ; LMUL8-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; LMUL8-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; LMUL8-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -22,9 +22,9 @@ ; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP2]] ; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; RV32: vector.memcheck: -; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880 -; RV32-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940 -; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752 +; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 79880 +; RV32-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i32 39940 +; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 159752 ; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] ; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] ; RV32-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] diff --git a/llvm/test/Transforms/MemCpyOpt/no-libcalls.ll b/llvm/test/Transforms/MemCpyOpt/no-libcalls.ll --- a/llvm/test/Transforms/MemCpyOpt/no-libcalls.ll +++ b/llvm/test/Transforms/MemCpyOpt/no-libcalls.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=memcpyopt < %s | FileCheck %s --check-prefixes=CHECK,LIBCALLS +; RUN: opt -S -passes=memcpyopt -mtriple=x86_64 < %s | FileCheck %s --check-prefixes=CHECK,LIBCALLS ; RUN: opt -S -passes=memcpyopt -mtriple=amdgcn-- < %s | FileCheck %s --check-prefixes=CHECK,NO-LIBCALLS ; RUN: opt -S -passes=memcpyopt -mtriple=amdgcn-- -enable-memcpyopt-without-libcalls < %s \ ; RUN: | FileCheck %s --check-prefixes=CHECK,LIBCALLS @@ -38,12 +38,12 @@ define void @dont_create_memcpy(ptr %p1, ptr %p2) { ; LIBCALLS-LABEL: @dont_create_memcpy( -; LIBCALLS-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 4 [[P2:%.*]], ptr align 4 [[P1:%.*]], i64 8, i1 false) +; LIBCALLS-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[P2:%.*]], ptr align 8 [[P1:%.*]], i64 8, i1 false) ; LIBCALLS-NEXT: ret void ; ; NO-LIBCALLS-LABEL: @dont_create_memcpy( -; NO-LIBCALLS-NEXT: [[V:%.*]] = load [[TY:%.*]], ptr [[P1:%.*]], align 4 -; NO-LIBCALLS-NEXT: store [[TY]] [[V]], ptr [[P2:%.*]], align 4 +; NO-LIBCALLS-NEXT: [[V:%.*]] = load [[TY:%.*]], ptr [[P1:%.*]], align 8 +; NO-LIBCALLS-NEXT: store [[TY]] [[V]], ptr [[P2:%.*]], align 8 ; NO-LIBCALLS-NEXT: ret void ; %v = load %ty, ptr %p1 diff --git a/llvm/test/Transforms/MergeICmps/X86/addressspaces.ll b/llvm/test/Transforms/MergeICmps/X86/addressspaces.ll --- a/llvm/test/Transforms/MergeICmps/X86/addressspaces.ll +++ b/llvm/test/Transforms/MergeICmps/X86/addressspaces.ll @@ -46,13 +46,13 @@ ; CHECK-NEXT: [[PTR_B1:%.*]] = getelementptr inbounds [2 x i64], ptr addrspace(11) [[B:%.*]], i64 0, i64 1 ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[A0:%.*]] = load i64, ptr addrspace(11) [[A]], align 4 -; CHECK-NEXT: [[B0:%.*]] = load i64, ptr addrspace(11) [[B]], align 4 +; CHECK-NEXT: [[A0:%.*]] = load i64, ptr addrspace(11) [[A]], align 8 +; CHECK-NEXT: [[B0:%.*]] = load i64, ptr addrspace(11) [[B]], align 8 ; CHECK-NEXT: [[COND0:%.*]] = icmp eq i64 [[A0]], [[B0]] ; CHECK-NEXT: br i1 [[COND0]], label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[A1:%.*]] = load i64, ptr addrspace(11) [[PTR_A1]], align 4 -; CHECK-NEXT: [[B1:%.*]] = load i64, ptr addrspace(11) [[PTR_B1]], align 4 +; CHECK-NEXT: [[A1:%.*]] = load i64, ptr addrspace(11) [[PTR_A1]], align 8 +; CHECK-NEXT: [[B1:%.*]] = load i64, ptr addrspace(11) [[PTR_B1]], align 8 ; CHECK-NEXT: [[COND1:%.*]] = icmp eq i64 [[A1]], [[B1]] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: diff --git a/llvm/test/Transforms/NewGVN/refine-stores.ll b/llvm/test/Transforms/NewGVN/refine-stores.ll --- a/llvm/test/Transforms/NewGVN/refine-stores.ll +++ b/llvm/test/Transforms/NewGVN/refine-stores.ll @@ -56,7 +56,7 @@ ; CHECK-NEXT: b: ; CHECK-NEXT: br label [[C:%.*]] ; CHECK: c: -; CHECK-NEXT: store i64 undef, ptr null, align 4 +; CHECK-NEXT: store i64 undef, ptr null, align 8 ; CHECK-NEXT: br label [[E:%.*]] ; CHECK: e: ; CHECK-NEXT: store ptr undef, ptr null, align 8 @@ -93,9 +93,9 @@ ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[TMP3]], 0 ; CHECK-NEXT: br i1 [[TMP4]], label [[BB7]], label [[BB5:%.*]] ; CHECK: bb5: -; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr null, align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr null, align 8 ; CHECK-NEXT: call void @quux() -; CHECK-NEXT: store i64 [[TMP6]], ptr undef, align 4 +; CHECK-NEXT: store i64 [[TMP6]], ptr undef, align 8 ; CHECK-NEXT: br label [[BB7]] ; CHECK: bb7: ; CHECK-NEXT: [[TMP8]] = add i64 [[TMP3]], 1 diff --git a/llvm/test/Transforms/OpenMP/barrier_removal.ll b/llvm/test/Transforms/OpenMP/barrier_removal.ll --- a/llvm/test/Transforms/OpenMP/barrier_removal.ll +++ b/llvm/test/Transforms/OpenMP/barrier_removal.ll @@ -329,7 +329,7 @@ define void @pos_priv_mem() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_priv_mem ; CHECK-SAME: () #[[ATTR4]] { -; CHECK-NEXT: [[ARG:%.*]] = load ptr addrspace(5), ptr @GPtr5, align 8 +; CHECK-NEXT: [[ARG:%.*]] = load ptr addrspace(5), ptr @GPtr5, align 4 ; CHECK-NEXT: [[LOC:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[A:%.*]] = load i32, ptr @PG1, align 4 ; CHECK-NEXT: store i32 [[A]], ptr [[LOC]], align 4 diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll --- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll @@ -41,49 +41,47 @@ define void @loop(ptr %X, ptr %Y) { ; CHECK-LABEL: @loop( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X5:%.*]] = ptrtoint ptr [[X:%.*]] to i64 -; CHECK-NEXT: [[Y6:%.*]] = ptrtoint ptr [[Y:%.*]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[X5]], [[Y6]] +; CHECK-NEXT: [[X6:%.*]] = ptrtoint ptr [[X:%.*]] to i64 +; CHECK-NEXT: [[Y7:%.*]] = ptrtoint ptr [[Y:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[X6]], [[Y7]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 ; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[INDEX]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <2 x double>, ptr [[TMP3]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD7]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD7]], -; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP6]], <2 x double> , <2 x double> [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP7]], <2 x double> , <2 x double> [[WIDE_LOAD7]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i64 2 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x double>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD8]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD8]], +; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP5]], <2 x double> , <2 x double> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP6]], <2 x double> , <2 x double> [[WIDE_LOAD8]] +; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP4]], <2 x double> zeroinitializer, <2 x double> [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP5]], <2 x double> zeroinitializer, <2 x double> [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[TMP11]], i64 2 ; CHECK-NEXT: store <2 x double> [[TMP10]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[TMP12]], i64 2 -; CHECK-NEXT: store <2 x double> [[TMP11]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20000 -; CHECK-NEXT: br i1 [[TMP14]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20000 +; CHECK-NEXT: br i1 [[TMP13]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; CHECK: for.body: -; CHECK-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_04]] to i64 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt double [[TMP15]], 0.000000e+00 -; CHECK-NEXT: [[CMP1_I:%.*]] = fcmp ogt double [[TMP15]], 6.000000e+00 -; CHECK-NEXT: [[DOTV_I:%.*]] = select i1 [[CMP1_I]], double 6.000000e+00, double [[TMP15]] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt double [[TMP14]], 0.000000e+00 +; CHECK-NEXT: [[CMP1_I:%.*]] = fcmp ogt double [[TMP14]], 6.000000e+00 +; CHECK-NEXT: [[DOTV_I:%.*]] = select i1 [[CMP1_I]], double 6.000000e+00, double [[TMP14]] ; CHECK-NEXT: [[RETVAL_0_I:%.*]] = select i1 [[CMP_I]], double 0.000000e+00, double [[DOTV_I]] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store double [[RETVAL_0_I]], ptr [[ARRAYIDX2]], align 8 -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_04]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I_04]], 19999 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20000 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; entry: %X.addr = alloca ptr, align 8 @@ -193,8 +191,8 @@ ; CHECK-NEXT: [[ADD_SINK:%.*]] = phi float [ [[ADD]], [[ELSE]] ], [ [[MUL2_I81_I]], [[LOOP_BODY]] ] ; CHECK-NEXT: store float [[ADD_SINK]], ptr [[B_GEP_0]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 -; CHECK-NEXT: [[CMP_0:%.*]] = icmp ult i64 [[IV1]], 9999 -; CHECK-NEXT: br i1 [[CMP_0]], label [[LOOP_BODY]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 10000 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll --- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll @@ -92,21 +92,20 @@ ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <225 x double>, ptr [[B:%.*]], i64 0, i64 [[CONV6]] ; CHECK-NEXT: br label [[FOR_BODY4_US:%.*]] ; CHECK: for.body4.us: -; CHECK-NEXT: [[K_011_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[FOR_BODY4_US]] ] -; CHECK-NEXT: [[CONV_US:%.*]] = zext i32 [[K_011_US]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[K_011_US]], 225 +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY4_US]] ] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[INDVARS_IV]], 225 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP2]]) -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <225 x double>, ptr [[A:%.*]], i64 0, i64 [[CONV_US]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <225 x double>, ptr [[A:%.*]], i64 0, i64 [[INDVARS_IV]] ; CHECK-NEXT: [[MATRIXEXT_US:%.*]] = load double, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[MATRIXEXT8_US:%.*]] = load double, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[MUL_US:%.*]] = fmul double [[MATRIXEXT_US]], [[MATRIXEXT8_US]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[CONV_US]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[INDVARS_IV]] ; CHECK-NEXT: [[MATRIXEXT11_US:%.*]] = load double, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[SUB_US:%.*]] = fsub double [[MATRIXEXT11_US]], [[MUL_US]] ; CHECK-NEXT: store double [[SUB_US]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[K_011_US]], 1 -; CHECK-NEXT: [[CMP2_US:%.*]] = icmp ult i32 [[INC_US]], [[I]] -; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY4_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[CONV6]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]], label [[FOR_BODY4_US]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us: ; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[CONV6]], 15 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 [[I]], 210 @@ -114,10 +113,9 @@ ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP5]] ; CHECK-NEXT: br label [[FOR_BODY4_US_1:%.*]] ; CHECK: for.body4.us.1: -; CHECK-NEXT: [[K_011_US_1:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[INC_US_1:%.*]], [[FOR_BODY4_US_1]] ] -; CHECK-NEXT: [[CONV_US_1:%.*]] = zext i32 [[K_011_US_1]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[CONV_US_1]], 15 -; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i32 [[K_011_US_1]], 210 +; CHECK-NEXT: [[INDVARS_IV_1:%.*]] = phi i64 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_BODY4_US_1]] ] +; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[INDVARS_IV_1]], 15 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i64 [[INDVARS_IV_1]], 210 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP9]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[TMP8]] ; CHECK-NEXT: [[MATRIXEXT_US_1:%.*]] = load double, ptr [[TMP10]], align 8 @@ -127,9 +125,9 @@ ; CHECK-NEXT: [[MATRIXEXT11_US_1:%.*]] = load double, ptr [[TMP11]], align 8 ; CHECK-NEXT: [[SUB_US_1:%.*]] = fsub double [[MATRIXEXT11_US_1]], [[MUL_US_1]] ; CHECK-NEXT: store double [[SUB_US_1]], ptr [[TMP11]], align 8 -; CHECK-NEXT: [[INC_US_1]] = add nuw nsw i32 [[K_011_US_1]], 1 -; CHECK-NEXT: [[CMP2_US_1:%.*]] = icmp ult i32 [[INC_US_1]], [[I]] -; CHECK-NEXT: br i1 [[CMP2_US_1]], label [[FOR_BODY4_US_1]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1:%.*]] +; CHECK-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV_1]], 1 +; CHECK-NEXT: [[EXITCOND_NOT_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], [[CONV6]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_1]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1:%.*]], label [[FOR_BODY4_US_1]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.1: ; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[CONV6]], 30 ; CHECK-NEXT: [[TMP13:%.*]] = icmp ult i32 [[I]], 195 @@ -137,10 +135,9 @@ ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP12]] ; CHECK-NEXT: br label [[FOR_BODY4_US_2:%.*]] ; CHECK: for.body4.us.2: -; CHECK-NEXT: [[K_011_US_2:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]] ], [ [[INC_US_2:%.*]], [[FOR_BODY4_US_2]] ] -; CHECK-NEXT: [[CONV_US_2:%.*]] = zext i32 [[K_011_US_2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[CONV_US_2]], 30 -; CHECK-NEXT: [[TMP16:%.*]] = icmp ult i32 [[K_011_US_2]], 195 +; CHECK-NEXT: [[INDVARS_IV_2:%.*]] = phi i64 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]] ], [ [[INDVARS_IV_NEXT_2:%.*]], [[FOR_BODY4_US_2]] ] +; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[INDVARS_IV_2]], 30 +; CHECK-NEXT: [[TMP16:%.*]] = icmp ult i64 [[INDVARS_IV_2]], 195 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP16]]) ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[TMP15]] ; CHECK-NEXT: [[MATRIXEXT_US_2:%.*]] = load double, ptr [[TMP17]], align 8 @@ -150,9 +147,9 @@ ; CHECK-NEXT: [[MATRIXEXT11_US_2:%.*]] = load double, ptr [[TMP18]], align 8 ; CHECK-NEXT: [[SUB_US_2:%.*]] = fsub double [[MATRIXEXT11_US_2]], [[MUL_US_2]] ; CHECK-NEXT: store double [[SUB_US_2]], ptr [[TMP18]], align 8 -; CHECK-NEXT: [[INC_US_2]] = add nuw nsw i32 [[K_011_US_2]], 1 -; CHECK-NEXT: [[CMP2_US_2:%.*]] = icmp ult i32 [[INC_US_2]], [[I]] -; CHECK-NEXT: br i1 [[CMP2_US_2]], label [[FOR_BODY4_US_2]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2:%.*]] +; CHECK-NEXT: [[INDVARS_IV_NEXT_2]] = add nuw nsw i64 [[INDVARS_IV_2]], 1 +; CHECK-NEXT: [[EXITCOND_NOT_2:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_2]], [[CONV6]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_2]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2:%.*]], label [[FOR_BODY4_US_2]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.2: ; CHECK-NEXT: [[TMP19:%.*]] = add nuw nsw i64 [[CONV6]], 45 ; CHECK-NEXT: [[TMP20:%.*]] = icmp ult i32 [[I]], 180 @@ -160,10 +157,9 @@ ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP19]] ; CHECK-NEXT: br label [[FOR_BODY4_US_3:%.*]] ; CHECK: for.body4.us.3: -; CHECK-NEXT: [[K_011_US_3:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]] ], [ [[INC_US_3:%.*]], [[FOR_BODY4_US_3]] ] -; CHECK-NEXT: [[CONV_US_3:%.*]] = zext i32 [[K_011_US_3]] to i64 -; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw i64 [[CONV_US_3]], 45 -; CHECK-NEXT: [[TMP23:%.*]] = icmp ult i32 [[K_011_US_3]], 180 +; CHECK-NEXT: [[INDVARS_IV_3:%.*]] = phi i64 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_BODY4_US_3]] ] +; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw i64 [[INDVARS_IV_3]], 45 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ult i64 [[INDVARS_IV_3]], 180 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP23]]) ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[TMP22]] ; CHECK-NEXT: [[MATRIXEXT_US_3:%.*]] = load double, ptr [[TMP24]], align 8 @@ -173,9 +169,9 @@ ; CHECK-NEXT: [[MATRIXEXT11_US_3:%.*]] = load double, ptr [[TMP25]], align 8 ; CHECK-NEXT: [[SUB_US_3:%.*]] = fsub double [[MATRIXEXT11_US_3]], [[MUL_US_3]] ; CHECK-NEXT: store double [[SUB_US_3]], ptr [[TMP25]], align 8 -; CHECK-NEXT: [[INC_US_3]] = add nuw nsw i32 [[K_011_US_3]], 1 -; CHECK-NEXT: [[CMP2_US_3:%.*]] = icmp ult i32 [[INC_US_3]], [[I]] -; CHECK-NEXT: br i1 [[CMP2_US_3]], label [[FOR_BODY4_US_3]], label [[FOR_COND_CLEANUP]] +; CHECK-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV_3]], 1 +; CHECK-NEXT: [[EXITCOND_NOT_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], [[CONV6]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_3]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY4_US_3]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll --- a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll @@ -16,19 +16,20 @@ ; CHECK-NEXT: [[START_INT_I:%.*]] = ptrtoint ptr [[START_I]] to i64 ; CHECK-NEXT: [[END_INT_I:%.*]] = ptrtoint ptr [[END_I]] to i64 ; CHECK-NEXT: [[SUB_I:%.*]] = sub i64 [[END_INT_I]], [[START_INT_I]] +; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 0) ; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr [[VEC]], ptr [[B:%.*]], i64 0, i32 1 ; CHECK-NEXT: [[START_I1_PEEL:%.*]] = load ptr, ptr [[B]], align 8 ; CHECK-NEXT: [[END_I3_PEEL:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8 ; CHECK-NEXT: [[START_INT_I4_PEEL:%.*]] = ptrtoint ptr [[START_I1_PEEL]] to i64 ; CHECK-NEXT: [[END_INT_I5_PEEL:%.*]] = ptrtoint ptr [[END_I3_PEEL]] to i64 ; CHECK-NEXT: [[SUB_I6_PEEL:%.*]] = sub i64 [[END_INT_I5_PEEL]], [[START_INT_I4_PEEL]] -; CHECK-NEXT: [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 4 -; CHECK-NEXT: [[LV_I9_PEEL:%.*]] = load i64, ptr [[START_I1_PEEL]], align 4 +; CHECK-NEXT: [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 8 +; CHECK-NEXT: [[LV_I9_PEEL:%.*]] = load i64, ptr [[START_I1_PEEL]], align 8 ; CHECK-NEXT: [[SUM_NEXT_PEEL:%.*]] = add i64 [[LV_I_PEEL]], [[LV_I9_PEEL]] -; CHECK-NEXT: [[C_PEEL:%.*]] = icmp sgt i64 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[C_PEEL]], label [[LOOP_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK-NEXT: [[EXITCOND_PEEL_NOT:%.*]] = icmp slt i64 [[N]], 1 +; CHECK-NEXT: br i1 [[EXITCOND_PEEL_NOT]], label [[EXIT:%.*]], label [[LOOP_PREHEADER:%.*]] ; CHECK: loop.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 ; CHECK-NEXT: [[UMIN:%.*]] = tail call i64 @llvm.umin.i64(i64 [[SUB_I6_PEEL]], i64 [[TMP0]]) ; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[UMIN]] ; CHECK-NEXT: [[UMIN15:%.*]] = tail call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SUB_I]]) @@ -49,13 +50,13 @@ ; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <2 x i64>, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x i64>, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <2 x i64>, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[WIDE_LOAD17]], [[VEC_PHI16]] ; CHECK-NEXT: [[TMP12]] = add <2 x i64> [[TMP10]], [[WIDE_LOAD18]] @@ -87,14 +88,14 @@ ; CHECK-NEXT: unreachable ; CHECK: at_with_int_conversion.exit11: ; CHECK-NEXT: [[GEP_IDX_I:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[IV]] -; CHECK-NEXT: [[LV_I:%.*]] = load i64, ptr [[GEP_IDX_I]], align 4 +; CHECK-NEXT: [[LV_I:%.*]] = load i64, ptr [[GEP_IDX_I]], align 8 ; CHECK-NEXT: [[GEP_IDX_I8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[IV]] -; CHECK-NEXT: [[LV_I9:%.*]] = load i64, ptr [[GEP_IDX_I8]], align 4 +; CHECK-NEXT: [[LV_I9:%.*]] = load i64, ptr [[GEP_IDX_I8]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = add i64 [[LV_I]], [[SUM]] ; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[ADD]], [[LV_I9]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[C:%.*]] = icmp slt i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[SMAX]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[AT_WITH_INT_CONVERSION_EXIT11_PEEL:%.*]] ], [ [[SUM_NEXT]], [[AT_WITH_INT_CONVERSION_EXIT11]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] @@ -127,8 +128,9 @@ ; CHECK-NEXT: [[END_INT_I:%.*]] = ptrtoint ptr [[END_I]] to i64 ; CHECK-NEXT: [[SUB_I:%.*]] = sub i64 [[END_INT_I]], [[START_INT_I]] ; CHECK-NEXT: [[GEP_END_I13:%.*]] = getelementptr [[VEC]], ptr [[C:%.*]], i64 0, i32 1 +; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 0) ; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr [[VEC]], ptr [[B:%.*]], i64 0, i32 1 -; CHECK-NEXT: [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 4 +; CHECK-NEXT: [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 8 ; CHECK-NEXT: [[START_I1_PEEL:%.*]] = load ptr, ptr [[B]], align 8 ; CHECK-NEXT: [[END_I3_PEEL:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8 ; CHECK-NEXT: [[START_INT_I4_PEEL:%.*]] = ptrtoint ptr [[START_I1_PEEL]] to i64 @@ -140,14 +142,14 @@ ; CHECK-NEXT: [[START_INT_I15_PEEL:%.*]] = ptrtoint ptr [[START_I12_PEEL]] to i64 ; CHECK-NEXT: [[END_INT_I16_PEEL:%.*]] = ptrtoint ptr [[END_I14_PEEL]] to i64 ; CHECK-NEXT: [[SUB_I17_PEEL:%.*]] = sub i64 [[END_INT_I16_PEEL]], [[START_INT_I15_PEEL]] -; CHECK-NEXT: [[LV_I9_PEEL:%.*]] = load i64, ptr [[START_I1_PEEL]], align 4 -; CHECK-NEXT: [[LV_I20_PEEL:%.*]] = load i64, ptr [[START_I12_PEEL]], align 4 +; CHECK-NEXT: [[LV_I9_PEEL:%.*]] = load i64, ptr [[START_I1_PEEL]], align 8 +; CHECK-NEXT: [[LV_I20_PEEL:%.*]] = load i64, ptr [[START_I12_PEEL]], align 8 ; CHECK-NEXT: [[ADD_2_PEEL:%.*]] = add i64 [[LV_I_PEEL]], [[LV_I9_PEEL]] ; CHECK-NEXT: [[SUM_NEXT_PEEL:%.*]] = add i64 [[ADD_2_PEEL]], [[LV_I20_PEEL]] -; CHECK-NEXT: [[COND_PEEL:%.*]] = icmp sgt i64 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[COND_PEEL]], label [[LOOP_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK-NEXT: [[EXITCOND_PEEL_NOT:%.*]] = icmp slt i64 [[N]], 1 +; CHECK-NEXT: br i1 [[EXITCOND_PEEL_NOT]], label [[EXIT:%.*]], label [[LOOP_PREHEADER:%.*]] ; CHECK: loop.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 ; CHECK-NEXT: [[UMIN:%.*]] = tail call i64 @llvm.umin.i64(i64 [[SUB_I17_PEEL]], i64 [[TMP0]]) ; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[UMIN]] ; CHECK-NEXT: [[UMIN26:%.*]] = tail call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SUB_I6_PEEL]]) @@ -169,17 +171,17 @@ ; CHECK-NEXT: [[VEC_PHI28:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x i64>, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x i64>, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <2 x i64>, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[START_I12_PEEL]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <2 x i64>, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <2 x i64>, ptr [[TMP10]], align 8 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[TMP10]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <2 x i64>, ptr [[TMP11]], align 4 +; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <2 x i64>, ptr [[TMP11]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i64> [[WIDE_LOAD29]], [[VEC_PHI28]] ; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[TMP12]], [[WIDE_LOAD30]] @@ -207,7 +209,7 @@ ; CHECK-NEXT: unreachable ; CHECK: at_with_int_conversion.exit: ; CHECK-NEXT: [[GEP_IDX_I:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[IV]] -; CHECK-NEXT: [[LV_I:%.*]] = load i64, ptr [[GEP_IDX_I]], align 4 +; CHECK-NEXT: [[LV_I:%.*]] = load i64, ptr [[GEP_IDX_I]], align 8 ; CHECK-NEXT: [[INRANGE_I7:%.*]] = icmp ult i64 [[SUB_I6_PEEL]], [[IV]] ; CHECK-NEXT: br i1 [[INRANGE_I7]], label [[ERROR_I10:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT11:%.*]] ; CHECK: error.i10: @@ -221,15 +223,15 @@ ; CHECK-NEXT: unreachable ; CHECK: at_with_int_conversion.exit22: ; CHECK-NEXT: [[GEP_IDX_I8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[IV]] -; CHECK-NEXT: [[LV_I9:%.*]] = load i64, ptr [[GEP_IDX_I8]], align 4 +; CHECK-NEXT: [[LV_I9:%.*]] = load i64, ptr [[GEP_IDX_I8]], align 8 ; CHECK-NEXT: [[GEP_IDX_I19:%.*]] = getelementptr i64, ptr [[START_I12_PEEL]], i64 [[IV]] -; CHECK-NEXT: [[LV_I20:%.*]] = load i64, ptr [[GEP_IDX_I19]], align 4 +; CHECK-NEXT: [[LV_I20:%.*]] = load i64, ptr [[GEP_IDX_I19]], align 8 ; CHECK-NEXT: [[ADD_1:%.*]] = add i64 [[LV_I]], [[SUM]] ; CHECK-NEXT: [[ADD_2:%.*]] = add i64 [[ADD_1]], [[LV_I9]] ; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[ADD_2]], [[LV_I20]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[SMAX]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[AT_WITH_INT_CONVERSION_EXIT22_PEEL:%.*]] ], [ [[SUM_NEXT]], [[AT_WITH_INT_CONVERSION_EXIT22]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] @@ -267,7 +269,7 @@ ; CHECK-NEXT: br i1 [[INRANGE]], label [[ERROR:%.*]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: [[GEP_IDX:%.*]] = getelementptr i64, ptr [[START]], i64 [[IDX]] -; CHECK-NEXT: [[LV:%.*]] = load i64, ptr [[GEP_IDX]], align 4 +; CHECK-NEXT: [[LV:%.*]] = load i64, ptr [[GEP_IDX]], align 8 ; CHECK-NEXT: ret i64 [[LV]] ; CHECK: error: ; CHECK-NEXT: tail call void @error() diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll @@ -183,9 +183,9 @@ define i64 @red_ld_2xi64(ptr %ptr) { ; CHECK-LABEL: @red_ld_2xi64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LD0:%.*]] = load i64, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: [[LD0:%.*]] = load i64, ptr [[PTR:%.*]], align 8 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 1 -; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[GEP]], align 8 ; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i64 [[LD0]], [[LD1]] ; CHECK-NEXT: ret i64 [[ADD_1]] ; @@ -200,7 +200,7 @@ define i64 @red_ld_4xi64(ptr %ptr) { ; CHECK-LABEL: @red_ld_4xi64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[PTR:%.*]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP0]]) ; CHECK-NEXT: ret i64 [[TMP1]] ; @@ -221,7 +221,7 @@ define i64 @red_ld_8xi64(ptr %ptr) { ; CHECK-LABEL: @red_ld_8xi64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP0]]) ; CHECK-NEXT: ret i64 [[TMP1]] ; @@ -254,7 +254,7 @@ define i64 @red_ld_16xi64(ptr %ptr) { ; CHECK-LABEL: @red_ld_16xi64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i64>, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i64>, ptr [[PTR:%.*]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP0]]) ; CHECK-NEXT: ret i64 [[TMP1]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll @@ -101,7 +101,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = or <2 x i64> [[TMP0]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP3]], [[TMP6]] -; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[ARRAY1]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[ARRAY1]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll b/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll @@ -8,10 +8,10 @@ ; Base case with no interesting control dependencies define void @test_no_control(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test_no_control( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -32,13 +32,13 @@ define void @test1(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test1( -; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -58,13 +58,13 @@ define void @test2(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %c1 = load i64, ptr %c @@ -85,13 +85,13 @@ define void @test3(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test3( -; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -111,13 +111,13 @@ define void @test4(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test4( -; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -138,12 +138,12 @@ define void @test5(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test5( ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %a2 = getelementptr i64, ptr %a, i32 1 @@ -164,10 +164,10 @@ define void @test6(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -196,15 +196,15 @@ define void @test7(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 1 -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A]], align 4 -; CHECK-NEXT: store i64 0, ptr [[A]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A]], align 8 +; CHECK-NEXT: store i64 0, ptr [[A]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -229,15 +229,15 @@ define void @test8(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test8( ; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 1 -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A]], align 4 -; CHECK-NEXT: store i64 0, ptr [[A]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A]], align 8 +; CHECK-NEXT: store i64 0, ptr [[A]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_throw() #[[ATTR4:[0-9]+]] -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -262,15 +262,15 @@ define void @test9(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test9( ; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 1 -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A]], align 4 -; CHECK-NEXT: store i64 0, ptr [[A]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A]], align 8 +; CHECK-NEXT: store i64 0, ptr [[A]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_throw() -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -294,18 +294,18 @@ ; A variant of test7 which shows the same problem with a non-load instruction define void @test10(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test10( -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[A]], i32 1 -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 8 ; CHECK-NEXT: [[U1:%.*]] = udiv i64 200, [[V1]] -; CHECK-NEXT: store i64 [[U1]], ptr [[A]], align 4 +; CHECK-NEXT: store i64 [[U1]], ptr [[A]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro() ; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[V2]] -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[U2]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -334,14 +334,14 @@ define void @test11(i64 %x, i64 %y, ptr %b, ptr %c) { ; CHECK-LABEL: @test11( ; CHECK-NEXT: [[U1:%.*]] = udiv i64 200, [[X:%.*]] -; CHECK-NEXT: store i64 [[U1]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: store i64 [[U1]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro() ; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[Y:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[U2]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B]], align 4 +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B]], align 8 ; CHECK-NEXT: ret void ; %u1 = udiv i64 200, %x diff --git a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll @@ -52,14 +52,17 @@ define void @test2(i64* %a, i64* %b) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A:%.*]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[B:%.*]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP4]], [[TMP6]] -; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP5]], align 8 +; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 1 +; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 2 +; CHECK-NEXT: [[I1:%.*]] = ptrtoint ptr [[A1]] to i64 +; CHECK-NEXT: [[B3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 3 +; CHECK-NEXT: [[I2:%.*]] = ptrtoint ptr [[B3]] to i64 +; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A1]], align 8 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 8 +; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[I1]], [[V1]] +; CHECK-NEXT: [[ADD2:%.*]] = add i64 [[I2]], [[V2]] +; CHECK-NEXT: store i64 [[ADD1]], ptr [[A1]], align 8 +; CHECK-NEXT: store i64 [[ADD2]], ptr [[A2]], align 8 ; CHECK-NEXT: ret void ; %a1 = getelementptr inbounds i64, i64* %a, i64 1 diff --git a/llvm/test/Transforms/SafeStack/X86/setjmp2.ll b/llvm/test/Transforms/SafeStack/X86/setjmp2.ll --- a/llvm/test/Transforms/SafeStack/X86/setjmp2.ll +++ b/llvm/test/Transforms/SafeStack/X86/setjmp2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt -safe-stack -S -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck %s -; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s +; RUN: opt -safe-stack -S -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck %s --check-prefix=I386 +; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s --check-prefix=X86-64 %struct.__jmp_buf_tag = type { [8 x i64], i32, %struct.__sigset_t } %struct.__sigset_t = type { [16 x i64] } @@ -11,27 +11,48 @@ ; setjmp/longjmp test with dynamically sized array. ; Requires protector. define i32 @foo(i32 %size) nounwind uwtable safestack { -; CHECK-LABEL: define i32 @foo( -; CHECK-SAME: i32 [[SIZE:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[UNSAFE_STACK_PTR:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 8 -; CHECK-NEXT: [[UNSAFE_STACK_DYNAMIC_PTR:%.*]] = alloca ptr, align 8 -; CHECK-NEXT: store ptr [[UNSAFE_STACK_PTR]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[SIZE]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -16 -; CHECK-NEXT: [[A:%.*]] = inttoptr i64 [[TMP5]] to ptr -; CHECK-NEXT: store ptr [[A]], ptr @__safestack_unsafe_stack_ptr, align 8 -; CHECK-NEXT: store ptr [[A]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8 -; CHECK-NEXT: [[CALL:%.*]] = call i32 @_setjmp(ptr @buf) #[[ATTR1:[0-9]+]] -; CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8 -; CHECK-NEXT: store ptr [[TMP6]], ptr @__safestack_unsafe_stack_ptr, align 8 -; CHECK-NEXT: call void @funcall(ptr [[A]]) -; CHECK-NEXT: store ptr [[UNSAFE_STACK_PTR]], ptr @__safestack_unsafe_stack_ptr, align 8 -; CHECK-NEXT: ret i32 0 +; I386-LABEL: define i32 @foo( +; I386-SAME: i32 [[SIZE:%.*]]) #[[ATTR0:[0-9]+]] { +; I386-NEXT: entry: +; I386-NEXT: [[UNSAFE_STACK_PTR:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 4 +; I386-NEXT: [[UNSAFE_STACK_DYNAMIC_PTR:%.*]] = alloca ptr, align 4 +; I386-NEXT: store ptr [[UNSAFE_STACK_PTR]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 4 +; I386-NEXT: [[TMP0:%.*]] = mul i32 [[SIZE]], 4 +; I386-NEXT: [[TMP1:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 4 +; I386-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i32 +; I386-NEXT: [[TMP3:%.*]] = sub i32 [[TMP2]], [[TMP0]] +; I386-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], -16 +; I386-NEXT: [[A:%.*]] = inttoptr i32 [[TMP4]] to ptr +; I386-NEXT: store ptr [[A]], ptr @__safestack_unsafe_stack_ptr, align 4 +; I386-NEXT: store ptr [[A]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 4 +; I386-NEXT: [[CALL:%.*]] = call i32 @_setjmp(ptr @buf) #[[ATTR1:[0-9]+]] +; I386-NEXT: [[TMP5:%.*]] = load ptr, ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 4 +; I386-NEXT: store ptr [[TMP5]], ptr @__safestack_unsafe_stack_ptr, align 4 +; I386-NEXT: call void @funcall(ptr [[A]]) +; I386-NEXT: store ptr [[UNSAFE_STACK_PTR]], ptr @__safestack_unsafe_stack_ptr, align 4 +; I386-NEXT: ret i32 0 +; +; X86-64-LABEL: define i32 @foo( +; X86-64-SAME: i32 [[SIZE:%.*]]) #[[ATTR0:[0-9]+]] { +; X86-64-NEXT: entry: +; X86-64-NEXT: [[UNSAFE_STACK_PTR:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 8 +; X86-64-NEXT: [[UNSAFE_STACK_DYNAMIC_PTR:%.*]] = alloca ptr, align 8 +; X86-64-NEXT: store ptr [[UNSAFE_STACK_PTR]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8 +; X86-64-NEXT: [[TMP0:%.*]] = zext i32 [[SIZE]] to i64 +; X86-64-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; X86-64-NEXT: [[TMP2:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 8 +; X86-64-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[TMP2]] to i64 +; X86-64-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], [[TMP1]] +; X86-64-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -16 +; X86-64-NEXT: [[A:%.*]] = inttoptr i64 [[TMP5]] to ptr +; X86-64-NEXT: store ptr [[A]], ptr @__safestack_unsafe_stack_ptr, align 8 +; X86-64-NEXT: store ptr [[A]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8 +; X86-64-NEXT: [[CALL:%.*]] = call i32 @_setjmp(ptr @buf) #[[ATTR1:[0-9]+]] +; X86-64-NEXT: [[TMP6:%.*]] = load ptr, ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8 +; X86-64-NEXT: store ptr [[TMP6]], ptr @__safestack_unsafe_stack_ptr, align 8 +; X86-64-NEXT: call void @funcall(ptr [[A]]) +; X86-64-NEXT: store ptr [[UNSAFE_STACK_PTR]], ptr @__safestack_unsafe_stack_ptr, align 8 +; X86-64-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn.ll --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn.ll @@ -22,20 +22,20 @@ ; IR-NEXT: .preheader: ; IR-NEXT: [[I:%.*]] = sext i32 [[Y]] to i64 ; IR-NEXT: [[I1:%.*]] = sext i32 [[X]] to i64 -; IR-NEXT: [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[I1]], i64 [[I]] +; IR-NEXT: [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i32 0, i32 [[X]], i32 [[Y]] ; IR-NEXT: [[I3:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr ; IR-NEXT: [[I4:%.*]] = load float, ptr [[I3]], align 4 ; IR-NEXT: [[I5:%.*]] = fadd float [[I4]], 0.000000e+00 -; IR-NEXT: [[I82:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 1 -; IR-NEXT: [[I9:%.*]] = addrspacecast ptr addrspace(3) [[I82]] to ptr +; IR-NEXT: [[I87:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 1 +; IR-NEXT: [[I9:%.*]] = addrspacecast ptr addrspace(3) [[I87]] to ptr ; IR-NEXT: [[I10:%.*]] = load float, ptr [[I9]], align 4 ; IR-NEXT: [[I11:%.*]] = fadd float [[I5]], [[I10]] -; IR-NEXT: [[I144:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 32 -; IR-NEXT: [[I15:%.*]] = addrspacecast ptr addrspace(3) [[I144]] to ptr +; IR-NEXT: [[I1412:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 32 +; IR-NEXT: [[I15:%.*]] = addrspacecast ptr addrspace(3) [[I1412]] to ptr ; IR-NEXT: [[I16:%.*]] = load float, ptr [[I15]], align 4 ; IR-NEXT: [[I17:%.*]] = fadd float [[I11]], [[I16]] -; IR-NEXT: [[I187:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 33 -; IR-NEXT: [[I19:%.*]] = addrspacecast ptr addrspace(3) [[I187]] to ptr +; IR-NEXT: [[I1818:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 33 +; IR-NEXT: [[I19:%.*]] = addrspacecast ptr addrspace(3) [[I1818]] to ptr ; IR-NEXT: [[I20:%.*]] = load float, ptr [[I19]], align 4 ; IR-NEXT: [[I21:%.*]] = fadd float [[I17]], [[I20]] ; IR-NEXT: store float [[I21]], ptr [[OUTPUT]], align 4 @@ -84,20 +84,20 @@ ; IR-NEXT: .preheader: ; IR-NEXT: [[I:%.*]] = sext i32 [[Y]] to i64 ; IR-NEXT: [[I1:%.*]] = sext i32 [[X]] to i64 -; IR-NEXT: [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[I1]], i64 [[I]] +; IR-NEXT: [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i32 0, i32 [[X]], i32 [[Y]] ; IR-NEXT: [[I3:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr ; IR-NEXT: [[I4:%.*]] = load float, ptr [[I3]], align 4 ; IR-NEXT: [[I5:%.*]] = fadd float [[I4]], 0.000000e+00 -; IR-NEXT: [[I72:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 1 -; IR-NEXT: [[I8:%.*]] = addrspacecast ptr addrspace(3) [[I72]] to ptr +; IR-NEXT: [[I77:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 1 +; IR-NEXT: [[I8:%.*]] = addrspacecast ptr addrspace(3) [[I77]] to ptr ; IR-NEXT: [[I9:%.*]] = load float, ptr [[I8]], align 4 ; IR-NEXT: [[I10:%.*]] = fadd float [[I5]], [[I9]] -; IR-NEXT: [[I124:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 32 -; IR-NEXT: [[I13:%.*]] = addrspacecast ptr addrspace(3) [[I124]] to ptr +; IR-NEXT: [[I1212:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 32 +; IR-NEXT: [[I13:%.*]] = addrspacecast ptr addrspace(3) [[I1212]] to ptr ; IR-NEXT: [[I14:%.*]] = load float, ptr [[I13]], align 4 ; IR-NEXT: [[I15:%.*]] = fadd float [[I10]], [[I14]] -; IR-NEXT: [[I167:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 33 -; IR-NEXT: [[I17:%.*]] = addrspacecast ptr addrspace(3) [[I167]] to ptr +; IR-NEXT: [[I1618:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 33 +; IR-NEXT: [[I17:%.*]] = addrspacecast ptr addrspace(3) [[I1618]] to ptr ; IR-NEXT: [[I18:%.*]] = load float, ptr [[I17]], align 4 ; IR-NEXT: [[I19:%.*]] = fadd float [[I15]], [[I18]] ; IR-NEXT: store float [[I19]], ptr [[OUTPUT]], align 4 @@ -145,20 +145,20 @@ ; IR-NEXT: .preheader: ; IR-NEXT: [[I:%.*]] = zext i32 [[Y]] to i64 ; IR-NEXT: [[I1:%.*]] = zext i32 [[X]] to i64 -; IR-NEXT: [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[I1]], i64 [[I]] +; IR-NEXT: [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i32 0, i32 [[X]], i32 [[Y]] ; IR-NEXT: [[I3:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr ; IR-NEXT: [[I4:%.*]] = load float, ptr [[I3]], align 4 ; IR-NEXT: [[I5:%.*]] = fadd float [[I4]], 0.000000e+00 -; IR-NEXT: [[I82:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 1 -; IR-NEXT: [[I9:%.*]] = addrspacecast ptr addrspace(3) [[I82]] to ptr +; IR-NEXT: [[I87:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 1 +; IR-NEXT: [[I9:%.*]] = addrspacecast ptr addrspace(3) [[I87]] to ptr ; IR-NEXT: [[I10:%.*]] = load float, ptr [[I9]], align 4 ; IR-NEXT: [[I11:%.*]] = fadd float [[I5]], [[I10]] -; IR-NEXT: [[I144:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 32 -; IR-NEXT: [[I15:%.*]] = addrspacecast ptr addrspace(3) [[I144]] to ptr +; IR-NEXT: [[I1412:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 32 +; IR-NEXT: [[I15:%.*]] = addrspacecast ptr addrspace(3) [[I1412]] to ptr ; IR-NEXT: [[I16:%.*]] = load float, ptr [[I15]], align 4 ; IR-NEXT: [[I17:%.*]] = fadd float [[I11]], [[I16]] -; IR-NEXT: [[I187:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 33 -; IR-NEXT: [[I19:%.*]] = addrspacecast ptr addrspace(3) [[I187]] to ptr +; IR-NEXT: [[I1818:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 33 +; IR-NEXT: [[I19:%.*]] = addrspacecast ptr addrspace(3) [[I1818]] to ptr ; IR-NEXT: [[I20:%.*]] = load float, ptr [[I19]], align 4 ; IR-NEXT: [[I21:%.*]] = fadd float [[I17]], [[I20]] ; IR-NEXT: store float [[I21]], ptr [[OUTPUT]], align 4 @@ -205,20 +205,20 @@ ; IR-NEXT: .preheader: ; IR-NEXT: [[I:%.*]] = zext i32 [[Y]] to i64 ; IR-NEXT: [[I1:%.*]] = zext i32 [[X]] to i64 -; IR-NEXT: [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[I1]], i64 [[I]] +; IR-NEXT: [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i32 0, i32 [[X]], i32 [[Y]] ; IR-NEXT: [[I3:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr ; IR-NEXT: [[I4:%.*]] = load float, ptr [[I3]], align 4 ; IR-NEXT: [[I5:%.*]] = fadd float [[I4]], 0.000000e+00 -; IR-NEXT: [[I72:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 1 -; IR-NEXT: [[I8:%.*]] = addrspacecast ptr addrspace(3) [[I72]] to ptr +; IR-NEXT: [[I77:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 1 +; IR-NEXT: [[I8:%.*]] = addrspacecast ptr addrspace(3) [[I77]] to ptr ; IR-NEXT: [[I9:%.*]] = load float, ptr [[I8]], align 4 ; IR-NEXT: [[I10:%.*]] = fadd float [[I5]], [[I9]] -; IR-NEXT: [[I124:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 32 -; IR-NEXT: [[I13:%.*]] = addrspacecast ptr addrspace(3) [[I124]] to ptr +; IR-NEXT: [[I1212:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 32 +; IR-NEXT: [[I13:%.*]] = addrspacecast ptr addrspace(3) [[I1212]] to ptr ; IR-NEXT: [[I14:%.*]] = load float, ptr [[I13]], align 4 ; IR-NEXT: [[I15:%.*]] = fadd float [[I10]], [[I14]] -; IR-NEXT: [[I167:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 33 -; IR-NEXT: [[I17:%.*]] = addrspacecast ptr addrspace(3) [[I167]] to ptr +; IR-NEXT: [[I1618:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 33 +; IR-NEXT: [[I17:%.*]] = addrspacecast ptr addrspace(3) [[I1618]] to ptr ; IR-NEXT: [[I18:%.*]] = load float, ptr [[I17]], align 4 ; IR-NEXT: [[I19:%.*]] = fadd float [[I15]], [[I18]] ; IR-NEXT: store float [[I19]], ptr [[OUTPUT]], align 4 diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep.ll --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep.ll @@ -12,10 +12,9 @@ define ptr addrspace(3) @packed_struct(i32 %i, i32 %j) { ; CHECK-LABEL: @packed_struct( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[I:%.*]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[J:%.*]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [1024 x %struct.Packed], ptr addrspace(3) @packed_struct_array, i64 0, i64 [[TMP0]], i32 1, i64 [[TMP1]] -; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 100 +; CHECK-NEXT: [[IDXPROM:%.*]] = trunc i64 0 to i32 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [1024 x %struct.Packed], ptr addrspace(3) @packed_struct_array, i32 [[IDXPROM]], i32 [[I:%.*]], i32 1, i32 [[J:%.*]] +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP0]], i32 100 ; CHECK-NEXT: ret ptr addrspace(3) [[UGLYGEP]] ; entry: diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll @@ -163,7 +163,7 @@ ; CHECK-NEXT: [[I2:%.*]] = add i64 [[B]], [[A]] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 [[I2]], i64 0 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i64 160 -; CHECK-NEXT: store i64 [[B5]], ptr [[OUT]], align 4 +; CHECK-NEXT: store i64 [[B5]], ptr [[OUT]], align 8 ; CHECK-NEXT: ret ptr [[P3]] ; entry: @@ -358,8 +358,8 @@ ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1 -; CHECK-NEXT: [[PTR22:%.*]] = getelementptr inbounds [[STRUCT2:%.*]], ptr [[TMP0]], i64 -3 -; CHECK-NEXT: ret ptr [[PTR22]] +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 -64 +; CHECK-NEXT: ret ptr [[UGLYGEP]] ; entry: %arrayidx = add nsw i64 %idx, -2 @@ -373,7 +373,7 @@ ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1 -; CHECK-NEXT: [[PTR21:%.*]] = getelementptr inbounds [[STRUCT2:%.*]], ptr [[TMP0]], i64 151 +; CHECK-NEXT: [[PTR21:%.*]] = getelementptr inbounds [[STRUCT2:%.*]], ptr [[TMP0]], i64 134 ; CHECK-NEXT: ret ptr [[PTR21]] ; entry: @@ -390,7 +390,7 @@ ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1 -; CHECK-NEXT: [[PTR21:%.*]] = getelementptr inbounds [[STRUCT2:%.*]], ptr [[TMP0]], i64 151 +; CHECK-NEXT: [[PTR21:%.*]] = getelementptr inbounds [[STRUCT2:%.*]], ptr [[TMP0]], i64 134 ; CHECK-NEXT: ret ptr [[PTR21]] ; entry: diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/RISCV/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/RISCV/split-gep.ll --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/RISCV/split-gep.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/RISCV/split-gep.ll @@ -13,11 +13,11 @@ ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[I:%.*]], 5 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[ARRAY:%.*]], i64 [[I]] ; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 5 -; CHECK-NEXT: store i64 [[J:%.*]], ptr [[GEP4]], align 4 +; CHECK-NEXT: store i64 [[J:%.*]], ptr [[GEP4]], align 8 ; CHECK-NEXT: [[GEP26:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 6 -; CHECK-NEXT: store i64 [[J]], ptr [[GEP26]], align 4 +; CHECK-NEXT: store i64 [[J]], ptr [[GEP26]], align 8 ; CHECK-NEXT: [[GEP38:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 35 -; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP38]], align 4 +; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP38]], align 8 ; CHECK-NEXT: ret i64 undef ; entry: @@ -169,11 +169,11 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[I:%.*]], 5 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[ARRAY:%.*]], i64 [[J:%.*]] -; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP]], align 4 +; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[ARRAY]], i64 [[I]] ; CHECK-NEXT: [[GEP52:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 6 -; CHECK-NEXT: store i64 [[I]], ptr [[GEP52]], align 4 -; CHECK-NEXT: store i64 [[I]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i64 [[I]], ptr [[GEP52]], align 8 +; CHECK-NEXT: store i64 [[I]], ptr [[TMP0]], align 8 ; CHECK-NEXT: ret i64 undef ; entry: diff --git a/llvm/test/Transforms/TypePromotion/ARM/pointers.ll b/llvm/test/Transforms/TypePromotion/ARM/pointers.ll --- a/llvm/test/Transforms/TypePromotion/ARM/pointers.ll +++ b/llvm/test/Transforms/TypePromotion/ARM/pointers.ll @@ -130,7 +130,7 @@ define i16 @pointer_to_pointer(ptr %arg, i16 zeroext %limit) { ; CHECK-LABEL: @pointer_to_pointer( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ADDR:%.*]] = load ptr, ptr [[ARG:%.*]], align 8 +; CHECK-NEXT: [[ADDR:%.*]] = load ptr, ptr [[ARG:%.*]], align 4 ; CHECK-NEXT: [[VAL:%.*]] = load i16, ptr [[ADDR]], align 2 ; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[VAL]] to i32 ; CHECK-NEXT: [[ADD:%.*]] = add nuw i32 [[TMP0]], 7 diff --git a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant-inseltpoison.ll --- a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant-inseltpoison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant-inseltpoison.ll @@ -26,7 +26,7 @@ define <2 x i64> @add_constant_load(ptr %p) { ; CHECK-LABEL: @add_constant_load( -; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8 ; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[LD]], i32 0 ; CHECK-NEXT: [[BO:%.*]] = add <2 x i64> [[INS]], ; CHECK-NEXT: ret <2 x i64> [[BO]] @@ -152,7 +152,7 @@ define <2 x i64> @shl_constant_op0_load(ptr %p) { ; CHECK-LABEL: @shl_constant_op0_load( -; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8 ; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[LD]], i32 1 ; CHECK-NEXT: [[BO:%.*]] = shl <2 x i64> , [[INS]] ; CHECK-NEXT: ret <2 x i64> [[BO]] @@ -203,7 +203,7 @@ define <2 x i64> @shl_constant_op1_load(ptr %p) { ; CHECK-LABEL: @shl_constant_op1_load( -; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8 ; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[LD]], i32 0 ; CHECK-NEXT: [[BO:%.*]] = shl nuw <2 x i64> [[INS]], ; CHECK-NEXT: ret <2 x i64> [[BO]] diff --git a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll --- a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll +++ b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll @@ -26,7 +26,7 @@ define <2 x i64> @add_constant_load(ptr %p) { ; CHECK-LABEL: @add_constant_load( -; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8 ; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[LD]], i32 0 ; CHECK-NEXT: [[BO:%.*]] = add <2 x i64> [[INS]], ; CHECK-NEXT: ret <2 x i64> [[BO]] @@ -152,7 +152,7 @@ define <2 x i64> @shl_constant_op0_load(ptr %p) { ; CHECK-LABEL: @shl_constant_op0_load( -; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8 ; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[LD]], i32 1 ; CHECK-NEXT: [[BO:%.*]] = shl <2 x i64> , [[INS]] ; CHECK-NEXT: ret <2 x i64> [[BO]] @@ -203,7 +203,7 @@ define <2 x i64> @shl_constant_op1_load(ptr %p) { ; CHECK-LABEL: @shl_constant_op1_load( -; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8 ; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[LD]], i32 0 ; CHECK-NEXT: [[BO:%.*]] = shl nuw <2 x i64> [[INS]], ; CHECK-NEXT: ret <2 x i64> [[BO]] diff --git a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll --- a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll @@ -169,7 +169,7 @@ define <2 x i1> @constant_op1_i64_load(ptr %p) { ; CHECK-LABEL: @constant_op1_i64_load( -; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8 ; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[LD]], i32 0 ; CHECK-NEXT: [[R:%.*]] = icmp eq <2 x i64> [[INS]], ; CHECK-NEXT: ret <2 x i1> [[R]] diff --git a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll --- a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll +++ b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll @@ -169,7 +169,7 @@ define <2 x i1> @constant_op1_i64_load(ptr %p) { ; CHECK-LABEL: @constant_op1_i64_load( -; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8 ; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[LD]], i32 0 ; CHECK-NEXT: [[R:%.*]] = icmp eq <2 x i64> [[INS]], ; CHECK-NEXT: ret <2 x i1> [[R]] diff --git a/llvm/test/tools/opt/infer-data-layout.ll b/llvm/test/tools/opt/infer-data-layout.ll new file mode 100644 --- /dev/null +++ b/llvm/test/tools/opt/infer-data-layout.ll @@ -0,0 +1,13 @@ +; REQUIRES: x86-registered-target +;; Check that we infer the correct datalayout from a target triple +; RUN: opt -mtriple=i386-linux-gnu -S -passes=no-op-module < %s | FileCheck %s --check-prefix=LINUX +; RUN: opt -mtriple=i386-apple-darwin -S -passes=no-op-module < %s | FileCheck %s --check-prefix=DARWIN +; RUN: opt -mtriple=i386-windows-msvc -S -passes=no-op-module < %s | FileCheck %s --check-prefix=WINDOWS + +target datalayout = "" +; LINUX: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:32-n8:16:32-S128" +; LINUX: target triple = "i386-unknown-linux-gnu" +; DARWIN: target datalayout = "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:128-n8:16:32-S128" +; DARWIN: target triple = "i386-apple-darwin" +; WINDOWS: target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32-a:0:32-S32" +; WINDOWS: target triple = "i386-unknown-windows-msvc" diff --git a/llvm/test/tools/opt/invalid-target.ll b/llvm/test/tools/opt/invalid-target.ll new file mode 100644 --- /dev/null +++ b/llvm/test/tools/opt/invalid-target.ll @@ -0,0 +1,16 @@ +;; Check that invalid triples are handled correctly by opt. + +;; No diagnostics should be printed for an explicitly/implicitly empty triple +; RUN: opt -S -passes=no-op-module -o /dev/null < %s 2>&1 | FileCheck %s --allow-empty --check-prefix=EMPTY +; RUN: opt '-mtriple=' -S -passes=no-op-module -o /dev/null < %s 2>&1 | FileCheck %s --allow-empty --check-prefix=EMPTY +; EMPTY-NOT: {{.+}} + +;; Using "unknown" as the architecture is explicitly allowed (but warns) +; RUN: opt -mtriple=unknown -S -passes=no-op-module -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=UNKNOWN +; UNKNOWN: opt: warning: failed to infer data layout: unable to get target for 'unknown', see --version and --triple. + +;; However, any other invalid target triple should cause the tool to fail: +; RUN: not opt -mtriple=invalid -S -passes=no-op-module -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=INVALID +; INVALID: opt: warning: failed to infer data layout: unable to get target for 'invalid', see --version and --triple. +; INVALID-NEXT: opt: unrecognized architecture 'invalid' provided. +; INVALID-EMPTY: diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -500,10 +500,36 @@ std::unique_ptr RemarksFile = std::move(*RemarksFileOrErr); // Load the input module... - auto SetDataLayout = [](StringRef, StringRef) -> std::optional { - if (ClDataLayout.empty()) + auto SetDataLayout = [&](StringRef IRTriple, + StringRef IRLayout) -> std::optional { + // Data layout specified on the command line has the highest priority. + if (!ClDataLayout.empty()) + return ClDataLayout; + // If an explicit data layout is already defined in the IR, don't infer. + if (!IRLayout.empty()) return std::nullopt; - return ClDataLayout; + + // If an explicit triple was specified (either in the IR or on the + // command line), use that to infer the default data layout. However, the + // command line target triple should override the IR file target triple. + std::string TripleStr = + TargetTriple.empty() ? IRTriple.str() : Triple::normalize(TargetTriple); + // If the triple string is still empty, we don't fall back to + // sys::getDefaultTargetTriple() since we do not want to have differing + // behaviour dependent on the configured default triple. Therefore, if the + // user did not pass -mtriple or define an explicit triple/datalayout in + // the IR, we should default to an empty (default) DataLayout. + if (TripleStr.empty()) + return std::nullopt; + // Otherwise we infer the DataLayout from the target machine. + Expected> ExpectedTM = + codegen::createTargetMachineForTriple(TripleStr, GetCodeGenOptLevel()); + if (!ExpectedTM) { + errs() << argv[0] << ": warning: failed to infer data layout: " + << toString(ExpectedTM.takeError()) << "\n"; + return std::nullopt; + } + return (*ExpectedTM)->createDataLayout().getStringRepresentation(); }; std::unique_ptr M; if (NoUpgradeDebugInfo) @@ -531,7 +557,7 @@ } } - // If we are supposed to override the target triple or data layout, do so now. + // If we are supposed to override the target triple, do so now. if (!TargetTriple.empty()) M->setTargetTriple(Triple::normalize(TargetTriple));