Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -85,7 +85,7 @@ static cl::opt ScalarizeGlobal( "amdgpu-scalarize-global-loads", cl::desc("Enable global load scalarization"), - cl::init(false), + cl::init(true), cl::Hidden); // Option to run internalize pass. Index: test/CodeGen/AMDGPU/add.ll =================================================================== --- test/CodeGen/AMDGPU/add.ll +++ test/CodeGen/AMDGPU/add.ll @@ -5,9 +5,9 @@ ;FUNC-LABEL: {{^}}test1: ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}} -;SI-NOT: [[REG]] -;SI: buffer_store_dword [[REG]], +;SI: s_add_i32 s[[REG:[0-9]+]], {{s[0-9]+, s[0-9]+}} +;SI: v_mov_b32_e32 v[[REG]], s[[REG]] +;SI: buffer_store_dword v[[REG]], define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in @@ -21,8 +21,8 @@ ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 @@ -39,10 +39,10 @@ ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 Index: test/CodeGen/AMDGPU/add_i128.ll =================================================================== --- test/CodeGen/AMDGPU/add_i128.ll +++ test/CodeGen/AMDGPU/add_i128.ll @@ -19,10 +19,10 @@ ; Check that the SGPR add operand is correctly moved to a VGPR. ; GCN-LABEL: {{^}}sgpr_operand: -; GCN: v_add_i32 -; GCN: v_addc_u32 -; GCN: v_addc_u32 -; GCN: v_addc_u32 +; GCN: s_add_u32 +; GCN: s_addc_u32 +; GCN: s_addc_u32 +; GCN: s_addc_u32 define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { %foo = load i128, i128 addrspace(1)* %in, align 8 %result = add i128 %foo, %a @@ -31,10 +31,10 @@ } ; GCN-LABEL: {{^}}sgpr_operand_reversed: -; GCN: v_add_i32 -; GCN: v_addc_u32 -; GCN: v_addc_u32 -; GCN: v_addc_u32 +; GCN: s_add_u32 +; GCN: s_addc_u32 +; GCN: s_addc_u32 +; GCN: s_addc_u32 define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { %foo = load i128, i128 addrspace(1)* %in, align 8 %result = add i128 %a, %foo Index: test/CodeGen/AMDGPU/add_i64.ll =================================================================== --- test/CodeGen/AMDGPU/add_i64.ll +++ test/CodeGen/AMDGPU/add_i64.ll @@ -19,8 +19,8 @@ ; Check that the SGPR add operand is correctly moved to a VGPR. ; SI-LABEL: {{^}}sgpr_operand: -; SI: v_add_i32 -; SI: v_addc_u32 +; SI: s_add_u32 +; SI: s_addc_u32 define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) { %foo = load i64, i64 addrspace(1)* %in, align 8 %result = add i64 %foo, %a @@ -32,8 +32,8 @@ ; SGPR as other operand. ; ; SI-LABEL: {{^}}sgpr_operand_reversed: -; SI: v_add_i32 -; SI: v_addc_u32 +; SI: s_add_u32 +; SI: s_addc_u32 define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) { %foo = load i64, i64 addrspace(1)* %in, align 8 %result = add i64 %a, %foo Index: test/CodeGen/AMDGPU/and-gcn.ll =================================================================== --- test/CodeGen/AMDGPU/and-gcn.ll +++ test/CodeGen/AMDGPU/and-gcn.ll @@ -2,8 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}v_and_i64_br: -; SI: v_and_b32 -; SI: v_and_b32 +; SI: s_and_b64 define amdgpu_kernel void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { entry: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 Index: test/CodeGen/AMDGPU/and.ll =================================================================== --- test/CodeGen/AMDGPU/and.ll +++ test/CodeGen/AMDGPU/and.ll @@ -8,8 +8,8 @@ ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}} define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 @@ -26,10 +26,11 @@ ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}} define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 @@ -136,7 +137,9 @@ ; FUNC-LABEL: {{^}}v_and_constant_i32 ; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}} define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { - %a = load i32, i32 addrspace(1)* %aptr, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %a = load i32, i32 addrspace(1)* %gep, align 4 %and = and i32 %a, 1234567 store i32 %and, i32 addrspace(1)* %out, align 4 ret void @@ -145,7 +148,9 @@ ; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32 ; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}} define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { - %a = load i32, i32 addrspace(1)* %aptr, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %a = load i32, i32 addrspace(1)* %gep, align 4 %and = and i32 %a, 64 store i32 %and, i32 addrspace(1)* %out, align 4 ret void @@ -154,7 +159,9 @@ ; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32 ; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}} define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { - %a = load i32, i32 addrspace(1)* %aptr, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %a = load i32, i32 addrspace(1)* %gep, align 4 %and = and i32 %a, -16 store i32 %and, i32 addrspace(1)* %out, align 4 ret void @@ -239,8 +246,11 @@ ; SI: v_and_b32 ; SI: v_and_b32 define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %a = load i64, i64 addrspace(1)* %gep.a, align 8 + %gep.b = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid + %b = load i64, i64 addrspace(1)* %gep.b, align 8 %and = and i64 %a, %b store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -251,7 +261,9 @@ ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}} ; SI: buffer_store_dwordx2 define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %a = load i64, i64 addrspace(1)* %gep.a, align 8 %and = and i64 %a, 1231231234567 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -299,26 +311,30 @@ } ; FUNC-LABEL: {{^}}v_and_i64_32_bit_constant: -; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]] ; SI-NOT: and ; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]] ; SI-NOT: and ; SI: buffer_store_dwordx2 define amdgpu_kernel void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %a = load i64, i64 addrspace(1)* %gep.a, align 8 %and = and i64 %a, 1234567 store i64 %and, i64 addrspace(1)* %out, align 8 ret void } ; FUNC-LABEL: {{^}}v_and_inline_imm_i64: -; SI: buffer_load_dword v{{[0-9]+}} +; SI: {{buffer|flat}}_load_dword v{{[0-9]+}} ; SI-NOT: and ; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}} ; SI-NOT: and ; SI: buffer_store_dwordx2 define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %a = load i64, i64 addrspace(1)* %gep.a, align 8 %and = and i64 %a, 64 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -326,13 +342,15 @@ ; FIXME: Should be able to reduce load width ; FUNC-LABEL: {{^}}v_and_inline_neg_imm_i64: -; SI: buffer_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} ; SI-NOT: and ; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]] ; SI-NOT: and ; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}} define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %a = load i64, i64 addrspace(1)* %gep.a, align 8 %and = and i64 %a, -8 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -549,5 +567,4 @@ store i64 %and, i64 addrspace(1)* %out, align 8 ret void } - attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/any_extend_vector_inreg.ll =================================================================== --- test/CodeGen/AMDGPU/any_extend_vector_inreg.ll +++ test/CodeGen/AMDGPU/any_extend_vector_inreg.ll @@ -2,9 +2,9 @@ ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}any_extend_vector_inreg_v16i8_to_v4i32: -; GCN: {{buffer|flat}}_load_dwordx4 -; GCN-DAG: {{buffer|flat}}_load_dwordx4 -; GCN-DAG: {{buffer|flat}}_load_dword +; GCN: s_load_dwordx4 +; GCN-DAG: s_load_dwordx4 +; GCN-DAG: s_load_dword ; GCN: {{buffer|flat}}_store_byte ; GCN: {{buffer|flat}}_store_byte Index: test/CodeGen/AMDGPU/bitreverse.ll =================================================================== --- test/CodeGen/AMDGPU/bitreverse.ll +++ test/CodeGen/AMDGPU/bitreverse.ll @@ -2,6 +2,8 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s +declare i32 @llvm.amdgcn.workitem.id.x() #1 + declare i16 @llvm.bitreverse.i16(i16) #1 declare i32 @llvm.bitreverse.i32(i32) #1 declare i64 @llvm.bitreverse.i64(i64) #1 @@ -42,12 +44,14 @@ } ; FUNC-LABEL: {{^}}v_brev_i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { - %val = load i32, i32 addrspace(1)* %valptr + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %gep %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 store i32 %brev, i32 addrspace(1)* %out ret void @@ -66,7 +70,9 @@ ; SI: v_bfrev_b32_e32 ; SI: v_bfrev_b32_e32 define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid + %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 store <2 x i32> %brev, <2 x i32> addrspace(1)* %out ret void @@ -82,7 +88,9 @@ ; FUNC-LABEL: {{^}}v_brev_i64: ; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0 define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 { - %val = load i64, i64 addrspace(1)* %valptr + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid + %val = load i64, i64 addrspace(1)* %gep %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 store i64 %brev, i64 addrspace(1)* %out ret void @@ -97,7 +105,9 @@ ; FUNC-LABEL: {{^}}v_brev_v2i64: define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 { - %val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x i64> , <2 x i64> addrspace(1)* %valptr, i32 %tid + %val = load <2 x i64>, <2 x i64> addrspace(1)* %gep %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 store <2 x i64> %brev, <2 x i64> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/bswap.ll =================================================================== --- test/CodeGen/AMDGPU/bswap.ll +++ test/CodeGen/AMDGPU/bswap.ll @@ -10,7 +10,7 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone ; FUNC-LABEL: @test_bswap_i32 -; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI: s_load_dword [[VAL:s[0-9]+]] ; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8 ; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24 ; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff Index: test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll =================================================================== --- test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll +++ test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll @@ -1,9 +1,9 @@ ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-CIVI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-CIVI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s -; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; OPT-LABEL: @test_no_sink_flat_small_offset_i32( ; OPT-CIVI: getelementptr i32, i32 addrspace(4)* %in @@ -40,7 +40,7 @@ ; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32( ; OPT: getelementptr i32, i32 addrspace(4)* %out, -; OPT-CI-NOT: getelementptr +; rOPT-CI-NOT: getelementptr ; OPT: br i1 ; OPT-CI: addrspacecast Index: test/CodeGen/AMDGPU/cgp-addressing-modes.ll =================================================================== --- test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -1,9 +1,9 @@ ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s -; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=tahiti -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" Index: test/CodeGen/AMDGPU/coalescer_remat.ll =================================================================== --- test/CodeGen/AMDGPU/coalescer_remat.ll +++ test/CodeGen/AMDGPU/coalescer_remat.ll @@ -12,7 +12,7 @@ ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 ; It's probably OK if this is slightly higher: -; CHECK: ; NumVgprs: 8 +; CHECK: ; NumVgprs: 4 define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) { entry: %cmpflag = icmp eq i32 %flag, 1 Index: test/CodeGen/AMDGPU/copy-illegal-type.ll =================================================================== --- test/CodeGen/AMDGPU/copy-illegal-type.ll +++ test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -5,35 +5,41 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone ; FUNC-LABEL: {{^}}test_copy_v4i8: -; GCN: buffer_load_dword [[REG:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x + %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}test_copy_v4i8_x2: -; GCN: buffer_load_dword [[REG:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]] ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x + %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 ret void } ; FUNC-LABEL: {{^}}test_copy_v4i8_x3: -; GCN: buffer_load_dword [[REG:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]] ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x + %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 @@ -41,14 +47,16 @@ } ; FUNC-LABEL: {{^}}test_copy_v4i8_x4: -; GCN: buffer_load_dword [[REG:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]] ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x + %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 @@ -57,7 +65,7 @@ } ; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use: -; GCN: buffer_load_dword +; GCN: {{buffer|flat}}_load_dword ; GCN-DAG: v_lshrrev_b32 ; GCN: v_and_b32 ; GCN: v_or_b32 @@ -66,7 +74,9 @@ ; GCN: s_endpgm define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x + %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4 %add = add <4 x i8> %val, store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 @@ -97,19 +107,21 @@ } ; FUNC-LABEL: {{^}}test_copy_v3i8_align4: -; GCN: buffer_load_dword +; GCN: {{buffer|flat}}_load_dword ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} ; GCN: s_endpgm define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { - %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x + %val = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}test_copy_v3i8_align2: -; GCN-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; GCN-DAG: {{buffer|flat}}_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: {{buffer|flat}}_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} ; GCN: s_endpgm @@ -120,9 +132,9 @@ } ; FUNC-LABEL: {{^}}test_copy_v3i8_align1: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte @@ -135,10 +147,10 @@ } ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte ; GCN: buffer_store_dword ; GCN: s_endpgm define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { @@ -148,10 +160,10 @@ } ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: buffer_store_byte Index: test/CodeGen/AMDGPU/ctlz.ll =================================================================== --- test/CodeGen/AMDGPU/ctlz.ll +++ test/CodeGen/AMDGPU/ctlz.ll @@ -34,7 +34,7 @@ } ; FUNC-LABEL: {{^}}v_ctlz_i32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]] ; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[CTLZ]] ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 32, [[CTLZ]], vcc @@ -44,14 +44,16 @@ ; EG: FFBH_UINT ; EG: CNDE_INT define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}v_ctlz_v2i32: -; GCN: buffer_load_dwordx2 +; GCN: {{buffer|flat}}_load_dwordx2 ; GCN: v_ffbh_u32_e32 ; GCN: v_ffbh_u32_e32 ; GCN: buffer_store_dwordx2 @@ -62,14 +64,16 @@ ; EG: FFBH_UINT ; EG: CNDE_INT define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid + %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8 ret void } ; FUNC-LABEL: {{^}}v_ctlz_v4i32: -; GCN: buffer_load_dwordx4 +; GCN: {{buffer|flat}}_load_dwordx4 ; GCN: v_ffbh_u32_e32 ; GCN: v_ffbh_u32_e32 ; GCN: v_ffbh_u32_e32 @@ -90,14 +94,16 @@ ; EG-DAG: FFBH_UINT ; EG-DAG: CNDE_INT define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid + %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16 ret void } ; FUNC-LABEL: {{^}}v_ctlz_i8: -; GCN: buffer_load_ubyte [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]], ; SI-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; VI-DAG: v_ffbh_u32_sdwa [[RESULT:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GCN: buffer_store_byte [[RESULT]], @@ -168,12 +174,14 @@ } ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_neg1: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm - define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr +define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp eq i32 %val, 0 %sel = select i1 %cmp, i32 -1, i32 %ctlz @@ -182,12 +190,14 @@ } ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_neg1: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp ne i32 %val, 0 %sel = select i1 %cmp, i32 %ctlz, i32 -1 @@ -197,13 +207,15 @@ ; TODO: Should be able to eliminate select here as well. ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_bitwidth: -; GCN: buffer_load_dword +; GCN: {{buffer|flat}}_load_dword ; GCN: v_ffbh_u32_e32 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: s_endpgm define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp eq i32 %ctlz, 32 %sel = select i1 %cmp, i32 -1, i32 %ctlz @@ -212,13 +224,15 @@ } ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_bitwidth: -; GCN: buffer_load_dword +; GCN: {{buffer|flat}}_load_dword ; GCN: v_ffbh_u32_e32 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: s_endpgm define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp ne i32 %ctlz, 32 %sel = select i1 %cmp, i32 %ctlz, i32 -1 @@ -242,7 +256,7 @@ } ; FUNC-LABEL: {{^}}v_ctlz_i16_sel_eq_neg1: -; SI: buffer_load_ushort [[VAL:v[0-9]+]], +; SI: {{buffer|flat}}_load_ushort [[VAL:v[0-9]+]], ; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] ; SI: buffer_store_short [[FFBH]], define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind { Index: test/CodeGen/AMDGPU/ctlz_zero_undef.ll =================================================================== --- test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -29,21 +29,23 @@ } ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: FFBH_UINT {{\*? *}}[[RESULT]] define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v2i32: -; GCN: buffer_load_dwordx2 +; GCN: {{buffer|flat}}_load_dwordx2 ; GCN: v_ffbh_u32_e32 ; GCN: v_ffbh_u32_e32 ; GCN: buffer_store_dwordx2 @@ -52,14 +54,16 @@ ; EG: FFBH_UINT {{\*? *}}[[RESULT]] ; EG: FFBH_UINT {{\*? *}}[[RESULT]] define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid + %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8 ret void } ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v4i32: -; GCN: buffer_load_dwordx4 +; GCN: {{buffer|flat}}_load_dwordx4 ; GCN: v_ffbh_u32_e32 ; GCN: v_ffbh_u32_e32 ; GCN: v_ffbh_u32_e32 @@ -72,18 +76,22 @@ ; EG: FFBH_UINT {{\*? *}}[[RESULT]] ; EG: FFBH_UINT {{\*? *}}[[RESULT]] define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid + %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16 ret void } ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i8: -; GCN: buffer_load_ubyte [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_byte [[RESULT]], define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { - %val = load i8, i8 addrspace(1)* %valptr + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid + %val = load i8, i8 addrspace(1)* %in.gep %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone store i8 %ctlz, i8 addrspace(1)* %out ret void @@ -145,11 +153,13 @@ } ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], - define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr +define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 0 %sel = select i1 %cmp, i32 -1, i32 %ctlz @@ -158,11 +168,13 @@ } ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_neg1: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp ne i32 %val, 0 %sel = select i1 %cmp, i32 %ctlz, i32 -1 @@ -186,15 +198,17 @@ } ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: v_ffbh_u32_e32 [[RESULT0:v[0-9]+]], [[VAL]] ; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, [[VAL]] ; GCN-DAG: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 0, 1, vcc ; GCN-DAG: buffer_store_dword [[RESULT0]] ; GCN-DAG: buffer_store_byte [[RESULT1]] ; GCN: s_endpgm - define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr +define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 0 %sel = select i1 %cmp, i32 -1, i32 %ctlz @@ -205,13 +219,15 @@ ; Selected on wrong constant ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_0: -; GCN: buffer_load_dword +; GCN: {{buffer|flat}}_load_dword ; GCN: v_ffbh_u32_e32 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword - define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr +define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 0 %sel = select i1 %cmp, i32 0, i32 %ctlz @@ -221,13 +237,15 @@ ; Selected on wrong constant ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_0: -; GCN: buffer_load_dword +; GCN: {{buffer|flat}}_load_dword ; GCN: v_ffbh_u32_e32 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp ne i32 %val, 0 %sel = select i1 %cmp, i32 %ctlz, i32 0 @@ -237,13 +255,15 @@ ; Compare on wrong constant ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_cmp_non0: -; GCN: buffer_load_dword +; GCN: {{buffer|flat}}_load_dword ; GCN: v_ffbh_u32_e32 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword - define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr +define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 1 %sel = select i1 %cmp, i32 0, i32 %ctlz @@ -253,13 +273,15 @@ ; Selected on wrong constant ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_cmp_non0: -; GCN: buffer_load_dword +; GCN: {{buffer|flat}}_load_dword ; GCN: v_ffbh_u32_e32 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp ne i32 %val, 1 %sel = select i1 %cmp, i32 %ctlz, i32 0 Index: test/CodeGen/AMDGPU/ctpop.ll =================================================================== --- test/CodeGen/AMDGPU/ctpop.ll +++ test/CodeGen/AMDGPU/ctpop.ll @@ -8,6 +8,8 @@ declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + ; FUNC-LABEL: {{^}}s_ctpop_i32: ; GCN: s_load_dword [[SVAL:s[0-9]+]], ; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]] @@ -24,22 +26,24 @@ ; XXX - Why 0 in register? ; FUNC-LABEL: {{^}}v_ctpop_i32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 0 ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone store i32 %ctpop, i32 addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32: -; GCN: buffer_load_dword [[VAL1:v[0-9]+]], -; GCN: buffer_load_dword [[VAL0:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL1:v[0-9]+]], ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] @@ -49,8 +53,11 @@ ; EG: BCNT_INT ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind { - %val0 = load i32, i32 addrspace(1)* %in0, align 4 - %val1 = load i32, i32 addrspace(1)* %in1, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %tid + %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %tid + %val0 = load i32, i32 addrspace(1)* %in0.gep, align 4 + %val1 = load i32, i32 addrspace(1)* %in1.gep, align 4 %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone %add = add i32 %ctpop0, %ctpop1 @@ -59,15 +66,17 @@ } ; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32: -; GCN: buffer_load_dword [[VAL0:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]], ; GCN: s_waitcnt ; GCN-NEXT: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}} ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind { - %val0 = load i32, i32 addrspace(1)* %in0, align 4 - %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone - %add = add i32 %ctpop0, %sval +define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %sval) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + %add = add i32 %ctpop, %sval store i32 %add, i32 addrspace(1)* %out, align 4 ret void } @@ -80,7 +89,9 @@ ; EG: BCNT_INT ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid + %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8 %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8 ret void @@ -98,7 +109,9 @@ ; EG: BCNT_INT ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid + %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16 %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16 ret void @@ -124,7 +137,9 @@ ; EG: BCNT_INT ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind { - %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <8 x i32>, <8 x i32> addrspace(1)* %in, i32 %tid + %val = load <8 x i32>, <8 x i32> addrspace(1)* %in.gep, align 32 %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32 ret void @@ -166,21 +181,25 @@ ; EG: BCNT_INT ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind { - %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <16 x i32>, <16 x i32> addrspace(1)* %in, i32 %tid + %val = load <16 x i32>, <16 x i32> addrspace(1)* %in.gep, align 32 %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32 ret void } ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4 ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %ctpop, 4 store i32 %add, i32 addrspace(1)* %out, align 4 @@ -188,14 +207,16 @@ } ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4 ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 4, %ctpop store i32 %add, i32 addrspace(1)* %out, align 4 @@ -203,14 +224,16 @@ } ; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal: -; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], +; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %ctpop, 99999 store i32 %add, i32 addrspace(1)* %out, align 4 @@ -218,7 +241,7 @@ } ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var: -; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], +; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: s_load_dword [[VAR:s[0-9]+]], ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] ; GCN: buffer_store_dword [[RESULT]], @@ -226,7 +249,9 @@ ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %ctpop, %const store i32 %add, i32 addrspace(1)* %out, align 4 @@ -234,7 +259,7 @@ } ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv: -; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], +; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: s_load_dword [[VAR:s[0-9]+]], ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] ; GCN: buffer_store_dword [[RESULT]], @@ -242,7 +267,9 @@ ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %const, %ctpop store i32 %add, i32 addrspace(1)* %out, align 4 @@ -250,18 +277,22 @@ } ; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv: -; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], {{0$}} -; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16 -; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; SI: buffer_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 +; SI: buffer_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 +; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]] +; VI: flat_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] +; VI: flat_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4 + %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 %tid %const = load i32, i32 addrspace(1)* %gep, align 4 %add = add i32 %const, %ctpop store i32 %add, i32 addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/ctpop64.ll =================================================================== --- test/CodeGen/AMDGPU/ctpop64.ll +++ test/CodeGen/AMDGPU/ctpop64.ll @@ -1,6 +1,8 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + declare i64 @llvm.ctpop.i64(i64) nounwind readnone declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone @@ -25,14 +27,16 @@ } ; FUNC-LABEL: {{^}}v_ctpop_i64: -; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, +; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %val = load i64, i64 addrspace(1)* %in, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %val = load i64, i64 addrspace(1)* %in.gep, align 8 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone %truncctpop = trunc i64 %ctpop to i32 store i32 %truncctpop, i32 addrspace(1)* %out, align 4 @@ -40,7 +44,7 @@ } ; FUNC-LABEL: {{^}}v_ctpop_i64_user: -; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, +; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] @@ -49,7 +53,9 @@ ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind { - %val = load i64, i64 addrspace(1)* %in, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %val = load i64, i64 addrspace(1)* %in.gep, align 8 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone %or = or i64 %ctpop, %s.val store i64 %or, i64 addrspace(1)* %out @@ -87,7 +93,9 @@ ; GCN: v_bcnt_u32_b32 ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind { - %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i32 %tid + %val = load <2 x i64>, <2 x i64> addrspace(1)* %in.gep, align 16 %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8 @@ -105,7 +113,9 @@ ; GCN: v_bcnt_u32_b32 ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind { - %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid + %val = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep, align 32 %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16 @@ -169,7 +179,8 @@ ; FIXME: Should not have extra add ; FUNC-LABEL: {{^}}v_ctpop_i128: -; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; SI: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 +; VI: flat_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}} ; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0 ; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]] @@ -182,7 +193,9 @@ ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind { - %val = load i128, i128 addrspace(1)* %in, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %tid + %val = load i128, i128 addrspace(1)* %in.gep, align 8 %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone %truncctpop = trunc i128 %ctpop to i32 store i32 %truncctpop, i32 addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/cttz_zero_undef.ll =================================================================== --- test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -5,6 +5,7 @@ declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone +declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32: ; SI: s_load_dword [[VAL:s[0-9]+]], @@ -21,21 +22,23 @@ } ; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: FFBL_INT {{\*? *}}[[RESULT]] define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone store i32 %cttz, i32 addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32: -; SI: buffer_load_dwordx2 +; SI: {{buffer|flat}}_load_dwordx2 ; SI: v_ffbl_b32_e32 ; SI: v_ffbl_b32_e32 ; SI: buffer_store_dwordx2 @@ -44,14 +47,16 @@ ; EG: FFBL_INT {{\*? *}}[[RESULT]] ; EG: FFBL_INT {{\*? *}}[[RESULT]] define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid + %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8 %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8 ret void } ; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32: -; SI: buffer_load_dwordx4 +; SI: {{buffer|flat}}_load_dwordx4 ; SI: v_ffbl_b32_e32 ; SI: v_ffbl_b32_e32 ; SI: v_ffbl_b32_e32 @@ -64,7 +69,9 @@ ; EG: FFBL_INT {{\*? *}}[[RESULT]] ; EG: FFBL_INT {{\*? *}}[[RESULT]] define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid + %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16 %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16 ret void Index: test/CodeGen/AMDGPU/cvt_f32_ubyte.ll =================================================================== --- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -5,46 +5,52 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone ; GCN-LABEL: {{^}}load_i8_to_f32: -; GCN: buffer_load_ubyte [[LOADREG:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ubyte [[LOADREG:v[0-9]+]], ; GCN-NOT: bfe ; GCN-NOT: lshr ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]] ; GCN: buffer_store_dword [[CONV]], define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { - %load = load i8, i8 addrspace(1)* %in, align 1 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid + %load = load i8, i8 addrspace(1)* %gep, align 1 %cvt = uitofp i8 %load to float store float %cvt, float addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}load_v2i8_to_v2f32: -; GCN: buffer_load_ushort [[LD:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[LD:v[0-9]+]] ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]] ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid + %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2 %cvt = uitofp <2 x i8> %load to <2 x float> store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}load_v3i8_to_v3f32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]] ; GCN-NOT: v_cvt_f32_ubyte3_e32 ; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]] ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]] ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid + %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4 %cvt = uitofp <3 x i8> %load to <3 x float> store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}load_v4i8_to_v4f32: -; GCN: buffer_load_dword [[LOADREG:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]] ; GCN-NOT: bfe ; GCN-NOT: lshr ; GCN-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] @@ -53,7 +59,9 @@ ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] ; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid + %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4 %cvt = uitofp <4 x i8> %load to <4 x float> store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 ret void @@ -64,10 +72,10 @@ ; FIXME: Packing bytes ; GCN-LABEL: {{^}}load_v4i8_to_v4f32_unaligned: -; GCN: buffer_load_ubyte [[LOADREG3:v[0-9]+]] -; GCN: buffer_load_ubyte [[LOADREG2:v[0-9]+]] -; GCN: buffer_load_ubyte [[LOADREG1:v[0-9]+]] -; GCN: buffer_load_ubyte [[LOADREG0:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ubyte [[LOADREG3:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ubyte [[LOADREG2:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ubyte [[LOADREG1:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ubyte [[LOADREG0:v[0-9]+]] ; GCN-DAG: v_lshlrev_b32 ; GCN-DAG: v_or_b32 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], @@ -77,7 +85,9 @@ ; GCN: buffer_store_dwordx4 define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid + %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1 %cvt = uitofp <4 x i8> %load to <4 x float> store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 ret void @@ -124,14 +134,16 @@ ; GCN-LABEL: {{^}}load_v7i8_to_v7f32: ; GCN: s_endpgm define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid + %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1 %cvt = uitofp <7 x i8> %load to <7 x float> store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}load_v8i8_to_v8f32: -; GCN: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}}, +; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}}, ; GCN-NOT: bfe ; GCN-NOT: lshr ; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]] @@ -147,19 +159,23 @@ ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid + %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8 %cvt = uitofp <8 x i8> %load to <8 x float> store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}i8_zext_inreg_i32_to_f32: -; GCN: buffer_load_dword [[LOADREG:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]], ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]] ; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]] ; GCN: buffer_store_dword [[CONV]], define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %load = load i32, i32 addrspace(1)* %gep, align 4 %add = add i32 %load, 2 %inreg = and i32 %add, 255 %cvt = uitofp i32 %inreg to float @@ -169,7 +185,9 @@ ; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32: define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %load = load i32, i32 addrspace(1)* %gep, align 4 %inreg = and i32 %load, 65280 %shr = lshr i32 %inreg, 8 %cvt = uitofp i32 %shr to float @@ -181,7 +199,9 @@ ; them so it shouldn't really matter. ; GCN-LABEL: {{^}}i8_zext_i32_to_f32: define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { - %load = load i8, i8 addrspace(1)* %in, align 1 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid + %load = load i8, i8 addrspace(1)* %gep, align 1 %ext = zext i8 %load to i32 %cvt = uitofp i32 %ext to float store float %cvt, float addrspace(1)* %out, align 4 @@ -190,7 +210,9 @@ ; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32: define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid + %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1 %ext = zext <4 x i8> %load to <4 x i32> %cvt = uitofp <4 x i32> %ext to <4 x float> store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 @@ -198,12 +220,14 @@ } ; GCN-LABEL: {{^}}extract_byte0_to_f32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]] ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %gep %and = and i32 %val, 255 %cvt = uitofp i32 %and to float store float %cvt, float addrspace(1)* %out @@ -211,12 +235,14 @@ } ; GCN-LABEL: {{^}}extract_byte1_to_f32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]] ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %gep %srl = lshr i32 %val, 8 %and = and i32 %srl, 255 %cvt = uitofp i32 %and to float @@ -225,12 +251,14 @@ } ; GCN-LABEL: {{^}}extract_byte2_to_f32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]] ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %gep %srl = lshr i32 %val, 16 %and = and i32 %srl, 255 %cvt = uitofp i32 %and to float @@ -239,12 +267,14 @@ } ; GCN-LABEL: {{^}}extract_byte3_to_f32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]] ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %gep %srl = lshr i32 %val, 24 %and = and i32 %srl, 255 %cvt = uitofp i32 %and to float Index: test/CodeGen/AMDGPU/early-if-convert-cost.ll =================================================================== --- test/CodeGen/AMDGPU/early-if-convert-cost.ll +++ test/CodeGen/AMDGPU/early-if-convert-cost.ll @@ -1,4 +1,4 @@ -; RUN: llc -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; FIXME: Most of these cases that don't trigger because of broken cost ; heuristics. Should not need -stress-early-ifcvt Index: test/CodeGen/AMDGPU/early-if-convert.ll =================================================================== --- test/CodeGen/AMDGPU/early-if-convert.ll +++ test/CodeGen/AMDGPU/early-if-convert.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; FIXME: This leaves behind a now unnecessary and with exec Index: test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll =================================================================== --- test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll +++ test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll @@ -2,6 +2,8 @@ ; RUN: llc -march=amdgcn -enable-no-signed-zeros-fp-math=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s ; RUN: llc -march=amdgcn -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s +declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + ; Test that the -enable-no-signed-zeros-fp-math flag works ; GCN-LABEL: {{^}}fneg_fsub_f32: @@ -10,8 +12,11 @@ ; GCN-UNSAFE-NOT: xor define amdgpu_kernel void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 - %a = load float, float addrspace(1)* %in, align 4 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %add = add i32 %tid, 1 + %gep = getelementptr float, float addrspace(1)* %in, i32 %tid + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 %add + %a = load float, float addrspace(1)* %gep, align 4 %b = load float, float addrspace(1)* %b_ptr, align 4 %result = fsub float %a, %b %neg.result = fsub float -0.0, %result Index: test/CodeGen/AMDGPU/extractelt-to-trunc.ll =================================================================== --- test/CodeGen/AMDGPU/extractelt-to-trunc.ll +++ test/CodeGen/AMDGPU/extractelt-to-trunc.ll @@ -1,5 +1,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + ; Make sure the add and load are reduced to 32-bits even with the ; bitcast to vector. ; GCN-LABEL: {{^}}bitcast_int_to_vector_extract_0: @@ -8,7 +10,9 @@ ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, [[B]], [[A]] ; GCN: buffer_store_dword [[ADD]] define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { - %a = load i64, i64 addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %a = load i64, i64 addrspace(1)* %gep %add = add i64 %a, %b %val.bc = bitcast i64 %add to <2 x i32> %extract = extractelement <2 x i32> %val.bc, i32 0 @@ -21,7 +25,9 @@ ; GCN: v_add_f64 ; GCN: buffer_store_dword v define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) { - %a = load double, double addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + %a = load double, double addrspace(1)* %gep %add = fadd double %a, %b %val.bc = bitcast double %add to <2 x i32> %extract = extractelement <2 x i32> %val.bc, i32 0 @@ -34,7 +40,9 @@ ; GCN: v_add_i32 ; GCN: buffer_store_dword define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { - %a = load i64, i64 addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %a = load i64, i64 addrspace(1)* %gep %add = add i64 %a, %b %val.bc = bitcast i64 %add to <2 x float> %extract = extractelement <2 x float> %val.bc, i32 0 Index: test/CodeGen/AMDGPU/fabs.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fabs.f16.ll +++ test/CodeGen/AMDGPU/fabs.f16.ll @@ -134,7 +134,9 @@ ; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]] ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}} define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { - %val = load <2 x half>, <2 x half> addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) %fmul = fmul <2 x half> %fabs, %val store <2 x half> %fmul, <2 x half> addrspace(1)* %out Index: test/CodeGen/AMDGPU/fadd.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fadd.f16.ll +++ test/CodeGen/AMDGPU/fadd.f16.ll @@ -2,8 +2,8 @@ ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}fadd_f16 -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] @@ -24,7 +24,7 @@ } ; GCN-LABEL: {{^}}fadd_f16_imm_a -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] @@ -42,7 +42,7 @@ } ; GCN-LABEL: {{^}}fadd_f16_imm_b -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 2.0, v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] @@ -60,8 +60,8 @@ } ; GCN-LABEL: {{^}}fadd_v2f16: -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] @@ -70,8 +70,8 @@ ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] -; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] +; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] @@ -88,15 +88,18 @@ <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { entry: - %a.val = load <2 x half>, <2 x half> addrspace(1)* %a - %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.a = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %a, i32 %tid + %gep.b = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %b, i32 %tid + %a.val = load <2 x half>, <2 x half> addrspace(1)* %gep.a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %gep.b %r.val = fadd <2 x half> %a.val, %b.val store <2 x half> %r.val, <2 x half> addrspace(1)* %r ret void } ; GCN-LABEL: {{^}}fadd_v2f16_imm_a: -; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] @@ -118,14 +121,16 @@ <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: - %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.b = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %b, i32 %tid + %b.val = load <2 x half>, <2 x half> addrspace(1)* %gep.b %r.val = fadd <2 x half> , %b.val store <2 x half> %r.val, <2 x half> addrspace(1)* %r ret void } ; GCN-LABEL: {{^}}fadd_v2f16_imm_b: -; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] @@ -147,8 +152,15 @@ <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: - %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.a = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %a, i32 %tid + %a.val = load <2 x half>, <2 x half> addrspace(1)* %gep.a %r.val = fadd <2 x half> %a.val, store <2 x half> %r.val, <2 x half> addrspace(1)* %r ret void } + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/fadd64.ll =================================================================== --- test/CodeGen/AMDGPU/fadd64.ll +++ test/CodeGen/AMDGPU/fadd64.ll @@ -5,8 +5,11 @@ ; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}} define amdgpu_kernel void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr inbounds double, double addrspace(1)* %in1, i32 %tid + %gep2 = getelementptr inbounds double, double addrspace(1)* %in2, i32 %tid + %r0 = load double, double addrspace(1)* %gep1 + %r1 = load double, double addrspace(1)* %gep2 %r2 = fadd double %r0, %r1 store double %r2, double addrspace(1)* %out ret void @@ -42,3 +45,8 @@ store <2 x double> %r2, <2 x double> addrspace(1)* %out ret void } + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/fcanonicalize.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -5,6 +5,8 @@ declare half @llvm.canonicalize.f16(half) #0 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0 declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 + ; GCN-LABEL: {{^}}v_test_canonicalize_var_f16: ; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} @@ -213,7 +215,9 @@ ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}} ; GFX9: buffer_store_dword [[REG]] define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 { - %val = load <2 x half>, <2 x half> addrspace(1)* %out + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -233,7 +237,9 @@ ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]]{{$}} ; GCN: buffer_store_dword define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { - %val = load <2 x half>, <2 x half> addrspace(1)* %out + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -251,7 +257,9 @@ ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]] neg_lo:[0,1] neg_hi:[0,1]{{$}} ; GCN: buffer_store_dword define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { - %val = load <2 x half>, <2 x half> addrspace(1)* %out + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) %val.fabs.fneg = fsub <2 x half> , %val.fabs %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs.fneg) @@ -270,7 +278,9 @@ ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}} ; GFX9: buffer_store_dword [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 { - %val = load <2 x half>, <2 x half> addrspace(1)* %out + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep %fneg.val = fsub <2 x half> , %val %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out Index: test/CodeGen/AMDGPU/fcanonicalize.ll =================================================================== --- test/CodeGen/AMDGPU/fcanonicalize.ll +++ test/CodeGen/AMDGPU/fcanonicalize.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s declare float @llvm.fabs.f32(float) #0 declare float @llvm.canonicalize.f32(float) #0 Index: test/CodeGen/AMDGPU/fcmp.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fcmp.f16.ll +++ test/CodeGen/AMDGPU/fcmp.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}fcmp_f16_lt ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] Index: test/CodeGen/AMDGPU/fcmp64.ll =================================================================== --- test/CodeGen/AMDGPU/fcmp64.ll +++ test/CodeGen/AMDGPU/fcmp64.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s ; CHECK-LABEL: {{^}}flt_f64: -; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +; CHECK: v_cmp_nge_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 @@ -14,7 +14,7 @@ } ; CHECK-LABEL: {{^}}fle_f64: -; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +; CHECK: v_cmp_ngt_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 @@ -26,7 +26,7 @@ } ; CHECK-LABEL: {{^}}fgt_f64: -; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +; CHECK: v_cmp_nle_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 @@ -38,7 +38,7 @@ } ; CHECK-LABEL: {{^}}fge_f64: -; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +; CHECK: v_cmp_nlt_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 @@ -50,7 +50,7 @@ } ; CHECK-LABEL: {{^}}fne_f64: -; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +; CHECK: v_cmp_neq_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 @@ -62,7 +62,7 @@ } ; CHECK-LABEL: {{^}}feq_f64: -; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +; CHECK: v_cmp_nlg_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} define amdgpu_kernel void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 Index: test/CodeGen/AMDGPU/fconst64.ll =================================================================== --- test/CodeGen/AMDGPU/fconst64.ll +++ test/CodeGen/AMDGPU/fconst64.ll @@ -6,8 +6,15 @@ ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0 define amdgpu_kernel void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) { - %r1 = load double, double addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds double, double addrspace(1)* %in, i32 %tid + %r1 = load double, double addrspace(1)* %gep %r2 = fadd double %r1, 5.000000e+00 store double %r2, double addrspace(1)* %out ret void } + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/fcopysign.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fcopysign.f16.ll +++ test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s declare half @llvm.copysign.f16(half, half) declare float @llvm.copysign.f32(float, float) @@ -9,16 +9,18 @@ declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>) declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>) +declare i32 @llvm.amdgcn.workitem.id.x() + ; GCN-LABEL: {{^}}test_copysign_f16: -; SI: buffer_load_ushort v[[SIGN:[0-9]+]] -; SI: buffer_load_ushort v[[MAG:[0-9]+]] +; SI: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]] +; SI: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]] ; SI: s_brev_b32 s[[CONST:[0-9]+]], -2 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]] ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]] ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]] ; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]] -; GFX89: buffer_load_ushort v[[SIGN:[0-9]+]] -; GFX89: buffer_load_ushort v[[MAG:[0-9]+]] +; GFX89: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]] +; GFX89: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]] ; GFX89: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff ; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]] ; GCN: buffer_store_short v[[OUT]] @@ -36,8 +38,8 @@ } ; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f16_sign_f32: -; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]] -; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dword v[[SIGN:[0-9]+]] ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 ; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]] ; GCN: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_EXT]], v[[SIGN]] @@ -48,17 +50,20 @@ half addrspace(1)* %arg_mag, float addrspace(1)* %arg_sign) { entry: - %mag = load half, half addrspace(1)* %arg_mag + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid + %mag = load half, half addrspace(1)* %arg_mag_gep %mag.ext = fpext half %mag to float - %sign = load float, float addrspace(1)* %arg_sign + %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid + %sign = load float, float addrspace(1)* %arg_sign_gep %out = call float @llvm.copysign.f32(float %mag.ext, float %sign) store float %out, float addrspace(1)* %arg_out ret void } ; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64: -; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]] -; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}} +; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 ; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]] ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[MAG_EXT_LO:[0-9]+]]:[[MAG_EXT_HI:[0-9]+]]{{\]}}, v[[MAG_EXT]] @@ -70,17 +75,20 @@ half addrspace(1)* %arg_mag, double addrspace(1)* %arg_sign) { entry: - %mag = load half, half addrspace(1)* %arg_mag + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid + %mag = load half, half addrspace(1)* %arg_mag_gep %mag.ext = fpext half %mag to double - %sign = load double, double addrspace(1)* %arg_sign + %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid + %sign = load double, double addrspace(1)* %arg_sign_gep %out = call double @llvm.copysign.f64(double %mag.ext, double %sign) store double %out, double addrspace(1)* %arg_out ret void } ; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f32_sign_f16: -; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]] -; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dword v[[MAG:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]] ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]] ; SI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_F32]] @@ -93,8 +101,11 @@ float addrspace(1)* %arg_mag, half addrspace(1)* %arg_sign) { entry: - %mag = load float, float addrspace(1)* %arg_mag - %sign = load half, half addrspace(1)* %arg_sign + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid + %mag = load float, float addrspace(1)* %arg_mag_gep + %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid + %sign = load half, half addrspace(1)* %arg_sign_gep %sign.ext = fpext half %sign to float %out = call float @llvm.copysign.f32(float %mag, float %sign.ext) store float %out, float addrspace(1)* %arg_out @@ -102,8 +113,8 @@ } ; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f64_sign_f16: -; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}} -; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}} +; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]] ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]] ; SI: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_F32]] @@ -116,8 +127,11 @@ double addrspace(1)* %arg_mag, half addrspace(1)* %arg_sign) { entry: - %mag = load double, double addrspace(1)* %arg_mag - %sign = load half, half addrspace(1)* %arg_sign + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %arg_mag_gep = getelementptr double, double addrspace(1)* %arg_mag, i32 %tid + %mag = load double, double addrspace(1)* %arg_mag_gep + %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid + %sign = load half, half addrspace(1)* %arg_sign_gep %sign.ext = fpext half %sign to double %out = call double @llvm.copysign.f64(double %mag, double %sign.ext) store double %out, double addrspace(1)* %arg_out @@ -125,8 +139,8 @@ } ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f32: -; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]] -; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dword v[[SIGN:[0-9]+]] ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]] ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN]] @@ -141,8 +155,11 @@ half addrspace(1)* %arg_mag, float addrspace(1)* %arg_sign) { entry: - %mag = load half, half addrspace(1)* %arg_mag - %sign = load float, float addrspace(1)* %arg_sign + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid + %mag = load half, half addrspace(1)* %arg_mag_gep + %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid + %sign = load float, float addrspace(1)* %arg_sign_gep %sign.trunc = fptrunc float %sign to half %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc) store half %out, half addrspace(1)* %arg_out @@ -150,8 +167,8 @@ } ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64: -; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]] -; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}} +; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}} ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]] ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_HI]] @@ -166,8 +183,11 @@ half addrspace(1)* %arg_mag, double addrspace(1)* %arg_sign) { entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid %mag = load half, half addrspace(1)* %arg_mag - %sign = load double, double addrspace(1)* %arg_sign + %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid + %sign = load double, double addrspace(1)* %arg_sign_gep %sign.trunc = fptrunc double %sign to half %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc) store half %out, half addrspace(1)* %arg_out @@ -175,8 +195,8 @@ } ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f32_sign_f16: -; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]] -; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dword v[[MAG:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]] ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 ; SI-DAG: v_cvt_f16_f32_e32 v[[MAG_TRUNC:[0-9]+]], v[[MAG]] ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]] @@ -193,9 +213,12 @@ float addrspace(1)* %arg_mag, half addrspace(1)* %arg_sign) { entry: - %mag = load float, float addrspace(1)* %arg_mag + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid + %mag = load float, float addrspace(1)* %arg_mag_gep %mag.trunc = fptrunc float %mag to half - %sign = load half, half addrspace(1)* %arg_sign + %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid + %sign = load half, half addrspace(1)* %arg_sign_gep %out = call half @llvm.copysign.f16(half %mag.trunc, half %sign) store half %out, half addrspace(1)* %arg_out ret void Index: test/CodeGen/AMDGPU/fma-combine.ll =================================================================== --- test/CodeGen/AMDGPU/fma-combine.ll +++ test/CodeGen/AMDGPU/fma-combine.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s ; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be ; beneficial even without fp32 denormals, but they do require no-infs-fp-math Index: test/CodeGen/AMDGPU/fma.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fma.f64.ll +++ test/CodeGen/AMDGPU/fma.f64.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s declare double @llvm.fma.f64(double, double, double) nounwind readnone declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone Index: test/CodeGen/AMDGPU/fma.ll =================================================================== --- test/CodeGen/AMDGPU/fma.ll +++ test/CodeGen/AMDGPU/fma.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare float @llvm.fma.f32(float, float, float) nounwind readnone declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone Index: test/CodeGen/AMDGPU/fmul.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fmul.f16.ll +++ test/CodeGen/AMDGPU/fmul.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}fmul_f16 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] Index: test/CodeGen/AMDGPU/fmul64.ll =================================================================== --- test/CodeGen/AMDGPU/fmul64.ll +++ test/CodeGen/AMDGPU/fmul64.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s ; FUNC-LABEL: {{^}}fmul_f64: ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} Index: test/CodeGen/AMDGPU/fmuladd.f32.ll =================================================================== --- test/CodeGen/AMDGPU/fmuladd.f32.ll +++ test/CodeGen/AMDGPU/fmuladd.f32.ll @@ -1,12 +1,12 @@ -; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s -; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s -; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s -; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s - -; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s -; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s -; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s -; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s + +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s ; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow. Index: test/CodeGen/AMDGPU/fmuladd.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fmuladd.f64.ll +++ test/CodeGen/AMDGPU/fmuladd.f64.ll @@ -1,9 +1,9 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICTSI %s -; RUN: llc -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s -; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s -; RUN: llc -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICTSI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s ; GCN-LABEL: {{^}}fmuladd_f64: ; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} Index: test/CodeGen/AMDGPU/fmuladd.v2f16.ll =================================================================== --- test/CodeGen/AMDGPU/fmuladd.v2f16.ll +++ test/CodeGen/AMDGPU/fmuladd.v2f16.ll @@ -1,12 +1,12 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s - -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s + +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s declare i32 @llvm.amdgcn.workitem.id.x() #1 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1 Index: test/CodeGen/AMDGPU/fneg-fabs.ll =================================================================== --- test/CodeGen/AMDGPU/fneg-fabs.ll +++ test/CodeGen/AMDGPU/fneg-fabs.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32: ; SI-NOT: and Index: test/CodeGen/AMDGPU/fneg.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fneg.f16.ll +++ test/CodeGen/AMDGPU/fneg.f16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s ; FIXME: Should be able to do scalar op ; GCN-LABEL: {{^}}s_fneg_f16: Index: test/CodeGen/AMDGPU/fp32_to_fp16.ll =================================================================== --- test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone Index: test/CodeGen/AMDGPU/fpext.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fpext.f16.ll +++ test/CodeGen/AMDGPU/fpext.f16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s ; GCN-LABEL: {{^}}fpext_f16_to_f32 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] Index: test/CodeGen/AMDGPU/fptosi.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fptosi.f16.ll +++ test/CodeGen/AMDGPU/fptosi.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}fptosi_f16_to_i16 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] Index: test/CodeGen/AMDGPU/fptoui.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fptoui.f16.ll +++ test/CodeGen/AMDGPU/fptoui.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}fptoui_f16_to_i16 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] Index: test/CodeGen/AMDGPU/fptrunc.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fptrunc.f16.ll +++ test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; GCN-LABEL: {{^}}fptrunc_f32_to_f16: ; GCN: buffer_load_dword v[[A_F32:[0-9]+]] Index: test/CodeGen/AMDGPU/fract.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fract.f64.ll +++ test/CodeGen/AMDGPU/fract.f64.ll @@ -1,9 +1,9 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s declare double @llvm.fabs.f64(double) #0 declare double @llvm.floor.f64(double) #0 Index: test/CodeGen/AMDGPU/fract.ll =================================================================== --- test/CodeGen/AMDGPU/fract.ll +++ test/CodeGen/AMDGPU/fract.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s declare float @llvm.fabs.f32(float) #0 declare float @llvm.floor.f32(float) #0 Index: test/CodeGen/AMDGPU/frem.ll =================================================================== --- test/CodeGen/AMDGPU/frem.ll +++ test/CodeGen/AMDGPU/frem.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}frem_f32: ; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}} Index: test/CodeGen/AMDGPU/fsqrt.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fsqrt.f64.ll +++ test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}v_safe_fsqrt_f64: ; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} Index: test/CodeGen/AMDGPU/fsqrt.ll =================================================================== --- test/CodeGen/AMDGPU/fsqrt.ll +++ test/CodeGen/AMDGPU/fsqrt.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x) Index: test/CodeGen/AMDGPU/fsub.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fsub.f16.ll +++ test/CodeGen/AMDGPU/fsub.f16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s ; GCN-LABEL: {{^}}fsub_f16: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] Index: test/CodeGen/AMDGPU/fsub.ll =================================================================== --- test/CodeGen/AMDGPU/fsub.ll +++ test/CodeGen/AMDGPU/fsub.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}v_fsub_f32: ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} Index: test/CodeGen/AMDGPU/fsub64.ll =================================================================== --- test/CodeGen/AMDGPU/fsub64.ll +++ test/CodeGen/AMDGPU/fsub64.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s declare double @llvm.fabs.f64(double) #0 Index: test/CodeGen/AMDGPU/ftrunc.f64.ll =================================================================== --- test/CodeGen/AMDGPU/ftrunc.f64.ll +++ test/CodeGen/AMDGPU/ftrunc.f64.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s declare double @llvm.trunc.f64(double) nounwind readnone declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone Index: test/CodeGen/AMDGPU/global-extload-i16.ll =================================================================== --- test/CodeGen/AMDGPU/global-extload-i16.ll +++ test/CodeGen/AMDGPU/global-extload-i16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FIXME: cypress is broken because the bigger testcases spill and it's not implemented Index: test/CodeGen/AMDGPU/half.ll =================================================================== --- test/CodeGen/AMDGPU/half.ll +++ test/CodeGen/AMDGPU/half.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; half args should be promoted to float for SI and lower. Index: test/CodeGen/AMDGPU/imm.ll =================================================================== --- test/CodeGen/AMDGPU/imm.ll +++ test/CodeGen/AMDGPU/imm.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; Use a 64-bit value with lo bits that can be represented as an inline constant ; GCN-LABEL: {{^}}i64_imm_inline_lo: Index: test/CodeGen/AMDGPU/immv216.ll =================================================================== --- test/CodeGen/AMDGPU/immv216.ll +++ test/CodeGen/AMDGPU/immv216.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s ; FIXME: Merge into imm.ll ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16: Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s ; Tests for indirect addressing on SI, which is implemented using dynamic ; indexing of vectors. Index: test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll =================================================================== --- test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll +++ test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GatherAllAliases gives up on trying to analyze cases where the ; pointer may have been loaded from an aliased store, so make sure Index: test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.fabs.f16(half %a) declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b) Index: test/CodeGen/AMDGPU/llvm.amdgcn.class.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.class.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.class.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s declare i1 @llvm.amdgcn.class.f32(float, i32) #1 declare i1 @llvm.amdgcn.class.f64(double, i32) #1 Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s ; FIXME: Enable for VI. Index: test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.amdgcn.ldexp.f16(half %a, i32 %b) Index: test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}bfe_i32_arg_arg_arg: ; GCN: v_bfe_i32 Index: test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s declare i32 @llvm.amdgcn.sffbh.i32(i32) #1 Index: test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone Index: test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}bfe_u32_arg_arg_arg: ; GCN: v_bfe_u32 Index: test/CodeGen/AMDGPU/llvm.ceil.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.ceil.f16(half %a) declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a) Index: test/CodeGen/AMDGPU/llvm.cos.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.cos.f16(half %a) declare <2 x half> @llvm.cos.v2f16(<2 x half> %a) Index: test/CodeGen/AMDGPU/llvm.exp2.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.exp2.f16.ll +++ test/CodeGen/AMDGPU/llvm.exp2.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.exp2.f16(half %a) declare <2 x half> @llvm.exp2.v2f16(<2 x half> %a) Index: test/CodeGen/AMDGPU/llvm.floor.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.floor.f16(half %a) declare <2 x half> @llvm.floor.v2f16(<2 x half> %a) Index: test/CodeGen/AMDGPU/llvm.fma.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.fma.f16(half %a, half %b, half %c) declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) Index: test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s -; RUN: llc -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s declare half @llvm.fmuladd.f16(half %a, half %b, half %c) declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) Index: test/CodeGen/AMDGPU/llvm.log2.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.log2.f16.ll +++ test/CodeGen/AMDGPU/llvm.log2.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.log2.f16(half %a) declare <2 x half> @llvm.log2.v2f16(<2 x half> %a) Index: test/CodeGen/AMDGPU/llvm.maxnum.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.maxnum.f16(half %a, half %b) declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) Index: test/CodeGen/AMDGPU/llvm.minnum.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.minnum.f16(half %a, half %b) declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) Index: test/CodeGen/AMDGPU/llvm.rint.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s declare half @llvm.rint.f16(half %a) declare <2 x half> @llvm.rint.v2f16(<2 x half> %a) Index: test/CodeGen/AMDGPU/llvm.sin.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.sin.f16(half %a) declare <2 x half> @llvm.sin.v2f16(<2 x half> %a) Index: test/CodeGen/AMDGPU/llvm.sqrt.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.sqrt.f16.ll +++ test/CodeGen/AMDGPU/llvm.sqrt.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.sqrt.f16(half %a) declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a) Index: test/CodeGen/AMDGPU/llvm.trunc.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.trunc.f16(half %a) declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a) Index: test/CodeGen/AMDGPU/load-global-f32.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-f32.ll +++ test/CodeGen/AMDGPU/load-global-f32.ll @@ -1,9 +1,9 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}global_load_f32: ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}} Index: test/CodeGen/AMDGPU/load-global-f64.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-f64.ll +++ test/CodeGen/AMDGPU/load-global-f64.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}global_load_f64: ; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] Index: test/CodeGen/AMDGPU/load-global-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i16.ll +++ test/CodeGen/AMDGPU/load-global-i16.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s ; FIXME: r600 is broken because the bigger testcases spill and it's not implemented Index: test/CodeGen/AMDGPU/load-global-i32.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i32.ll +++ test/CodeGen/AMDGPU/load-global-i32.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}global_load_i32: Index: test/CodeGen/AMDGPU/load-global-i64.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i64.ll +++ test/CodeGen/AMDGPU/load-global-i64.ll @@ -1,9 +1,9 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}global_load_i64: ; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] Index: test/CodeGen/AMDGPU/load-global-i8.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i8.ll +++ test/CodeGen/AMDGPU/load-global-i8.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}global_load_i8: Index: test/CodeGen/AMDGPU/load-weird-sizes.ll =================================================================== --- test/CodeGen/AMDGPU/load-weird-sizes.ll +++ test/CodeGen/AMDGPU/load-weird-sizes.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}load_i24: ; SI: {{flat|buffer}}_load_ubyte Index: test/CodeGen/AMDGPU/max.ll =================================================================== --- test/CodeGen/AMDGPU/max.ll +++ test/CodeGen/AMDGPU/max.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}v_test_imax_sge_i32: Index: test/CodeGen/AMDGPU/merge-stores.ll =================================================================== --- test/CodeGen/AMDGPU/merge-stores.ll +++ test/CodeGen/AMDGPU/merge-stores.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s ; This test is mostly to test DAG store merging, so disable the vectorizer. ; Run with devices with different unaligned load restrictions. Index: test/CodeGen/AMDGPU/mubuf.ll =================================================================== --- test/CodeGen/AMDGPU/mubuf.ll +++ test/CodeGen/AMDGPU/mubuf.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s declare i32 @llvm.amdgcn.workitem.id.x() readnone Index: test/CodeGen/AMDGPU/mul.ll =================================================================== --- test/CodeGen/AMDGPU/mul.ll +++ test/CodeGen/AMDGPU/mul.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC ; mul24 and mad24 are affected Index: test/CodeGen/AMDGPU/no-shrink-extloads.ll =================================================================== --- test/CodeGen/AMDGPU/no-shrink-extloads.ll +++ test/CodeGen/AMDGPU/no-shrink-extloads.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone Index: test/CodeGen/AMDGPU/or.ll =================================================================== --- test/CodeGen/AMDGPU/or.ll +++ test/CodeGen/AMDGPU/or.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}or_v2i32: Index: test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll +++ test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) #0 declare void @llvm.invariant.end.p0i8({}*, i64, i8* nocapture) #0 Index: test/CodeGen/AMDGPU/reduce-load-width-alignment.ll =================================================================== --- test/CodeGen/AMDGPU/reduce-load-width-alignment.ll +++ test/CodeGen/AMDGPU/reduce-load-width-alignment.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}reduce_i64_load_align_4_width_to_i32: ; GCN: buffer_load_dword [[VAL:v[0-9]+]] Index: test/CodeGen/AMDGPU/reorder-stores.ll =================================================================== --- test/CodeGen/AMDGPU/reorder-stores.ll +++ test/CodeGen/AMDGPU/reorder-stores.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s ; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store: ; SI: buffer_load_dwordx4 Index: test/CodeGen/AMDGPU/rotl.i64.ll =================================================================== --- test/CodeGen/AMDGPU/rotl.i64.ll +++ test/CodeGen/AMDGPU/rotl.i64.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s ; BOTH-LABEL: {{^}}s_rotl_i64: ; BOTH-DAG: s_lshl_b64 Index: test/CodeGen/AMDGPU/rotr.i64.ll =================================================================== --- test/CodeGen/AMDGPU/rotr.i64.ll +++ test/CodeGen/AMDGPU/rotr.i64.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s ; BOTH-LABEL: {{^}}s_rotr_i64: ; BOTH-DAG: s_sub_i32 Index: test/CodeGen/AMDGPU/rsq.ll =================================================================== --- test/CodeGen/AMDGPU/rsq.ll +++ test/CodeGen/AMDGPU/rsq.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s -; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare float @llvm.sqrt.f32(float) nounwind readnone Index: test/CodeGen/AMDGPU/s_movk_i32.ll =================================================================== --- test/CodeGen/AMDGPU/s_movk_i32.ll +++ test/CodeGen/AMDGPU/s_movk_i32.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; SI-LABEL: {{^}}s_movk_i32_k0: ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}} Index: test/CodeGen/AMDGPU/saddo.ll =================================================================== --- test/CodeGen/AMDGPU/saddo.ll +++ test/CodeGen/AMDGPU/saddo.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone Index: test/CodeGen/AMDGPU/salu-to-valu.ll =================================================================== --- test/CodeGen/AMDGPU/salu-to-valu.ll +++ test/CodeGen/AMDGPU/salu-to-valu.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workitem.id.y() #0 Index: test/CodeGen/AMDGPU/scalar_to_vector.ll =================================================================== --- test/CodeGen/AMDGPU/scalar_to_vector.ll +++ test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; XXX - Why the packing? ; GCN-LABEL: {{^}}scalar_to_vector_v2i32: Index: test/CodeGen/AMDGPU/schedule-global-loads.ll =================================================================== --- test/CodeGen/AMDGPU/schedule-global-loads.ll +++ test/CodeGen/AMDGPU/schedule-global-loads.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s ; FIXME: This currently doesn't do a great job of clustering the ; loads, which end up with extra moves between them. Right now, it Index: test/CodeGen/AMDGPU/scratch-buffer.ll =================================================================== --- test/CodeGen/AMDGPU/scratch-buffer.ll +++ test/CodeGen/AMDGPU/scratch-buffer.ll @@ -1,5 +1,5 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s ; When a frame index offset is more than 12-bits, make sure we don't store ; it in mubuf's offset field. Index: test/CodeGen/AMDGPU/sdiv.ll =================================================================== --- test/CodeGen/AMDGPU/sdiv.ll +++ test/CodeGen/AMDGPU/sdiv.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; The code generated by sdiv is long and complex and may frequently change. ; The goal of this test is to make sure the ISel doesn't fail. Index: test/CodeGen/AMDGPU/sdwa-peephole.ll =================================================================== --- test/CodeGen/AMDGPU/sdwa-peephole.ll +++ test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s ; GCN-LABEL: {{^}}add_shr_i32: ; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}} Index: test/CodeGen/AMDGPU/select-vectors.ll =================================================================== --- test/CodeGen/AMDGPU/select-vectors.ll +++ test/CodeGen/AMDGPU/select-vectors.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; Test expansion of scalar selects on vectors. ; Evergreen not enabled since it seems to be having problems with doubles. Index: test/CodeGen/AMDGPU/select.f16.ll =================================================================== --- test/CodeGen/AMDGPU/select.f16.ll +++ test/CodeGen/AMDGPU/select.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}select_f16: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] Index: test/CodeGen/AMDGPU/sext-in-reg.ll =================================================================== --- test/CodeGen/AMDGPU/sext-in-reg.ll +++ test/CodeGen/AMDGPU/sext-in-reg.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FIXME: i16 promotion pass ruins the scalar cases when legal. ; FIXME: r600 fails verifier Index: test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll =================================================================== --- test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll +++ test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s ; Copy VGPR -> SGPR used twice as an instruction operand, which is then ; used in an REG_SEQUENCE that also needs to be handled. Index: test/CodeGen/AMDGPU/sgpr-copy.ll =================================================================== --- test/CodeGen/AMDGPU/sgpr-copy.ll +++ test/CodeGen/AMDGPU/sgpr-copy.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}phi1: ; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0 Index: test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll =================================================================== --- test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll +++ test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; Extract the high bit of the 1st quarter ; GCN-LABEL: {{^}}v_uextract_bit_31_i128: Index: test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll =================================================================== --- test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; Make sure 64-bit BFE pattern does a 32-bit BFE on the relevant half. Index: test/CodeGen/AMDGPU/shift-i64-opts.ll =================================================================== --- test/CodeGen/AMDGPU/shift-i64-opts.ll +++ test/CodeGen/AMDGPU/shift-i64-opts.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=FAST64 -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=SLOW64 -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=FAST64 -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=SLOW64 -check-prefix=GCN %s ; lshr (i64 x), c: c > 32 => reg_sequence lshr (i32 hi_32(x)), (c - 32), 0 Index: test/CodeGen/AMDGPU/shl.ll =================================================================== --- test/CodeGen/AMDGPU/shl.ll +++ test/CodeGen/AMDGPU/shl.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() #0 Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) Index: test/CodeGen/AMDGPU/sign_extend.ll =================================================================== --- test/CodeGen/AMDGPU/sign_extend.ll +++ test/CodeGen/AMDGPU/sign_extend.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}s_sext_i1_to_i32: ; GCN: v_cndmask_b32_e64 Index: test/CodeGen/AMDGPU/sitofp.f16.ll =================================================================== --- test/CodeGen/AMDGPU/sitofp.f16.ll +++ test/CodeGen/AMDGPU/sitofp.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}sitofp_i16_to_f16 ; GCN: buffer_load_{{sshort|ushort}} v[[A_I16:[0-9]+]] Index: test/CodeGen/AMDGPU/sminmax.ll =================================================================== --- test/CodeGen/AMDGPU/sminmax.ll +++ test/CodeGen/AMDGPU/sminmax.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}s_abs_i32: ; GCN: s_abs_i32 Index: test/CodeGen/AMDGPU/sminmax.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s ; GCN-LABEL: {{^}}s_abs_v2i16: ; GFX9: s_load_dword [[VAL:s[0-9]+]] Index: test/CodeGen/AMDGPU/spill-cfg-position.ll =================================================================== --- test/CodeGen/AMDGPU/spill-cfg-position.ll +++ test/CodeGen/AMDGPU/spill-cfg-position.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -stress-regalloc=6 < %s | FileCheck %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -stress-regalloc=6 < %s | FileCheck %s ; Inline spiller can decide to move a spill as early as possible in the basic block. ; It will skip phis and label, but we also need to make sure it skips instructions Index: test/CodeGen/AMDGPU/sra.ll =================================================================== --- test/CodeGen/AMDGPU/sra.ll +++ test/CodeGen/AMDGPU/sra.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() #0 Index: test/CodeGen/AMDGPU/srem.ll =================================================================== --- test/CodeGen/AMDGPU/srem.ll +++ test/CodeGen/AMDGPU/srem.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 Index: test/CodeGen/AMDGPU/srl.ll =================================================================== --- test/CodeGen/AMDGPU/srl.ll +++ test/CodeGen/AMDGPU/srl.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() #0 Index: test/CodeGen/AMDGPU/ssubo.ll =================================================================== --- test/CodeGen/AMDGPU/ssubo.ll +++ test/CodeGen/AMDGPU/ssubo.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone Index: test/CodeGen/AMDGPU/sub.ll =================================================================== --- test/CodeGen/AMDGPU/sub.ll +++ test/CodeGen/AMDGPU/sub.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() readnone Index: test/CodeGen/AMDGPU/trunc-bitcast-vector.ll =================================================================== --- test/CodeGen/AMDGPU/trunc-bitcast-vector.ll +++ test/CodeGen/AMDGPU/trunc-bitcast-vector.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s ; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32: ; CHECK: buffer_load_dword v Index: test/CodeGen/AMDGPU/trunc.ll =================================================================== --- test/CodeGen/AMDGPU/trunc.ll +++ test/CodeGen/AMDGPU/trunc.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s declare i32 @llvm.r600.read.tidig.x() nounwind readnone Index: test/CodeGen/AMDGPU/uaddo.ll =================================================================== --- test/CodeGen/AMDGPU/uaddo.ll +++ test/CodeGen/AMDGPU/uaddo.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s ; FUNC-LABEL: {{^}}s_uaddo_i64_zext: ; GCN: s_add_u32 Index: test/CodeGen/AMDGPU/udiv.ll =================================================================== --- test/CodeGen/AMDGPU/udiv.ll +++ test/CodeGen/AMDGPU/udiv.ll @@ -1,9 +1,9 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}udiv_i32: ; EG-NOT: SETGE_INT Index: test/CodeGen/AMDGPU/uitofp.f16.ll =================================================================== --- test/CodeGen/AMDGPU/uitofp.f16.ll +++ test/CodeGen/AMDGPU/uitofp.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}uitofp_i16_to_f16 ; GCN: buffer_load_ushort v[[A_I16:[0-9]+]] Index: test/CodeGen/AMDGPU/urem.ll =================================================================== --- test/CodeGen/AMDGPU/urem.ll +++ test/CodeGen/AMDGPU/urem.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; The code generated by urem is long and complex and may frequently ; change. The goal of this test is to make sure the ISel doesn't fail Index: test/CodeGen/AMDGPU/usubo.ll =================================================================== --- test/CodeGen/AMDGPU/usubo.ll +++ test/CodeGen/AMDGPU/usubo.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s ; FUNC-LABEL: {{^}}s_usubo_i64_zext: ; GCN: s_sub_u32 Index: test/CodeGen/AMDGPU/v_mac.ll =================================================================== --- test/CodeGen/AMDGPU/v_mac.ll +++ test/CodeGen/AMDGPU/v_mac.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s ; GCN-LABEL: {{^}}mac_vvv: ; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}} Index: test/CodeGen/AMDGPU/v_mac_f16.ll =================================================================== --- test/CodeGen/AMDGPU/v_mac_f16.ll +++ test/CodeGen/AMDGPU/v_mac_f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}mac_f16: ; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] Index: test/CodeGen/AMDGPU/vectorize-global-local.ll =================================================================== --- test/CodeGen/AMDGPU/vectorize-global-local.ll +++ test/CodeGen/AMDGPU/vectorize-global-local.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s ; CHECK-DAG: flat_load_dwordx4 ; CHECK-DAG: flat_load_dwordx4 ; CHECK-DAG: flat_load_dwordx4 Index: test/CodeGen/AMDGPU/xor.ll =================================================================== --- test/CodeGen/AMDGPU/xor.ll +++ test/CodeGen/AMDGPU/xor.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}xor_v2i32: