Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -260,24 +260,15 @@ static StringRef computeDataLayout(const Triple &TT) { if (TT.getArch() == Triple::r600) { // 32-bit pointers. - if (TT.getEnvironmentName() == "amdgiz" || - TT.getEnvironmentName() == "amdgizcl") return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"; - return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; } // 32-bit private, local, and region pointers. 64-bit global, constant and // flat. - if (TT.getEnvironmentName() == "amdgiz" || - TT.getEnvironmentName() == "amdgizcl") return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32" "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"; - return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" - "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; } LLVM_READNONE Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -907,16 +907,9 @@ AMDGPUAS getAMDGPUAS(Triple T) { auto Env = T.getEnvironmentName(); AMDGPUAS AS; - if (Env == "amdgiz" || Env == "amdgizcl") { - AS.FLAT_ADDRESS = 0; - AS.PRIVATE_ADDRESS = 5; - AS.REGION_ADDRESS = 4; - } - else { - AS.FLAT_ADDRESS = 4; - AS.PRIVATE_ADDRESS = 0; - AS.REGION_ADDRESS = 5; - } + AS.FLAT_ADDRESS = 0; + AS.PRIVATE_ADDRESS = 5; + AS.REGION_ADDRESS = 4; return AS; } Index: test/Analysis/CostModel/AMDGPU/addrspacecast.ll =================================================================== --- 
test/Analysis/CostModel/AMDGPU/addrspacecast.ll +++ test/Analysis/CostModel/AMDGPU/addrspacecast.ll @@ -1,45 +1,45 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck %s ; CHECK: 'addrspacecast_global_to_flat' -; CHECK: estimated cost of 0 for {{.*}} addrspacecast i8 addrspace(1)* %ptr to i8 addrspace(4)* -define i8 addrspace(4)* @addrspacecast_global_to_flat(i8 addrspace(1)* %ptr) #0 { - %cast = addrspacecast i8 addrspace(1)* %ptr to i8 addrspace(4)* - ret i8 addrspace(4)* %cast +; CHECK: estimated cost of 0 for {{.*}} addrspacecast i8 addrspace(1)* %ptr to i8* +define i8* @addrspacecast_global_to_flat(i8 addrspace(1)* %ptr) #0 { + %cast = addrspacecast i8 addrspace(1)* %ptr to i8* + ret i8* %cast } ; CHECK: 'addrspacecast_global_to_flat_v2' -; CHECK: estimated cost of 0 for {{.*}} addrspacecast <2 x i8 addrspace(1)*> %ptr to <2 x i8 addrspace(4)*> -define <2 x i8 addrspace(4)*> @addrspacecast_global_to_flat_v2(<2 x i8 addrspace(1)*> %ptr) #0 { - %cast = addrspacecast <2 x i8 addrspace(1)*> %ptr to <2 x i8 addrspace(4)*> - ret <2 x i8 addrspace(4)*> %cast +; CHECK: estimated cost of 0 for {{.*}} addrspacecast <2 x i8 addrspace(1)*> %ptr to <2 x i8*> +define <2 x i8*> @addrspacecast_global_to_flat_v2(<2 x i8 addrspace(1)*> %ptr) #0 { + %cast = addrspacecast <2 x i8 addrspace(1)*> %ptr to <2 x i8*> + ret <2 x i8*> %cast } ; CHECK: 'addrspacecast_global_to_flat_v32' -; CHECK: estimated cost of 0 for {{.*}} addrspacecast <32 x i8 addrspace(1)*> %ptr to <32 x i8 addrspace(4)*> -define <32 x i8 addrspace(4)*> @addrspacecast_global_to_flat_v32(<32 x i8 addrspace(1)*> %ptr) #0 { - %cast = addrspacecast <32 x i8 addrspace(1)*> %ptr to <32 x i8 addrspace(4)*> - ret <32 x i8 addrspace(4)*> %cast +; CHECK: estimated cost of 0 for {{.*}} addrspacecast <32 x i8 addrspace(1)*> %ptr to <32 x i8*> +define <32 x i8*> @addrspacecast_global_to_flat_v32(<32 x i8 addrspace(1)*> %ptr) #0 { + %cast = addrspacecast <32 x i8 addrspace(1)*> %ptr 
to <32 x i8*> + ret <32 x i8*> %cast } ; CHECK: 'addrspacecast_local_to_flat' -; CHECK: estimated cost of 1 for {{.*}} addrspacecast i8 addrspace(3)* %ptr to i8 addrspace(4)* -define i8 addrspace(4)* @addrspacecast_local_to_flat(i8 addrspace(3)* %ptr) #0 { - %cast = addrspacecast i8 addrspace(3)* %ptr to i8 addrspace(4)* - ret i8 addrspace(4)* %cast +; CHECK: estimated cost of 1 for {{.*}} addrspacecast i8 addrspace(3)* %ptr to i8* +define i8* @addrspacecast_local_to_flat(i8 addrspace(3)* %ptr) #0 { + %cast = addrspacecast i8 addrspace(3)* %ptr to i8* + ret i8* %cast } ; CHECK: 'addrspacecast_local_to_flat_v2' -; CHECK: estimated cost of 2 for {{.*}} addrspacecast <2 x i8 addrspace(3)*> %ptr to <2 x i8 addrspace(4)*> -define <2 x i8 addrspace(4)*> @addrspacecast_local_to_flat_v2(<2 x i8 addrspace(3)*> %ptr) #0 { - %cast = addrspacecast <2 x i8 addrspace(3)*> %ptr to <2 x i8 addrspace(4)*> - ret <2 x i8 addrspace(4)*> %cast +; CHECK: estimated cost of 2 for {{.*}} addrspacecast <2 x i8 addrspace(3)*> %ptr to <2 x i8*> +define <2 x i8*> @addrspacecast_local_to_flat_v2(<2 x i8 addrspace(3)*> %ptr) #0 { + %cast = addrspacecast <2 x i8 addrspace(3)*> %ptr to <2 x i8*> + ret <2 x i8*> %cast } ; CHECK: 'addrspacecast_local_to_flat_v32' -; CHECK: estimated cost of 32 for {{.*}} addrspacecast <32 x i8 addrspace(3)*> %ptr to <32 x i8 addrspace(4)*> -define <32 x i8 addrspace(4)*> @addrspacecast_local_to_flat_v32(<32 x i8 addrspace(3)*> %ptr) #0 { - %cast = addrspacecast <32 x i8 addrspace(3)*> %ptr to <32 x i8 addrspace(4)*> - ret <32 x i8 addrspace(4)*> %cast +; CHECK: estimated cost of 32 for {{.*}} addrspacecast <32 x i8 addrspace(3)*> %ptr to <32 x i8*> +define <32 x i8*> @addrspacecast_local_to_flat_v32(<32 x i8 addrspace(3)*> %ptr) #0 { + %cast = addrspacecast <32 x i8 addrspace(3)*> %ptr to <32 x i8*> + ret <32 x i8*> %cast } attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/InlineAsmCrash.ll 
=================================================================== --- test/CodeGen/AMDGPU/InlineAsmCrash.ll +++ test/CodeGen/AMDGPU/InlineAsmCrash.ll @@ -4,9 +4,9 @@ ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: ;;#ASMEND -define void @foo(i32* %ptr) { +define void @foo(i32 addrspace(5)* %ptr) { %tmp = tail call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "s_nop 0", "=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65"(i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2) %tmp2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %tmp, 0 - store i32 %tmp2, i32* %ptr, align 4 + store i32 %tmp2, i32 addrspace(5)* %ptr, align 4 ret void } Index: 
test/CodeGen/AMDGPU/addrspacecast.ll =================================================================== --- test/CodeGen/AMDGPU/addrspacecast.ll +++ test/CodeGen/AMDGPU/addrspacecast.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s +target datalayout = "A5" ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast: ; HSA: enable_sgpr_private_segment_buffer = 1 @@ -35,8 +36,8 @@ ; CI: NumSgprs: {{[0-9][0-9]+}} ; GFX9: NumSgprs: {{[0-9]+}} define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 { - %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* - store volatile i32 7, i32 addrspace(4)* %stof + %stof = addrspacecast i32 addrspace(3)* %ptr to i32* + store volatile i32 7, i32* %stof ret void } @@ -73,9 +74,9 @@ ; CI: NumSgprs: {{[0-9][0-9]+}} ; GFX9: NumSgprs: {{[0-9]+}} -define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #0 { - %stof = addrspacecast i32* %ptr to i32 addrspace(4)* - store volatile i32 7, i32 addrspace(4)* %stof +define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32 addrspace(5)* %ptr) #0 { + %stof = addrspacecast i32 addrspace(5)* %ptr to i32* + store volatile i32 7, i32* %stof ret void } @@ -89,8 +90,8 @@ ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]] define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 { - %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)* - store volatile i32 7, i32 addrspace(4)* %stof + %stof = addrspacecast i32 addrspace(1)* %ptr to i32* + store volatile i32 7, i32* %stof ret void } @@ -101,8 +102,8 @@ ; HSA-DAG: 
v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] ; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}} define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 { - %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)* - %ld = load volatile i32, i32 addrspace(4)* %stof + %stof = addrspacecast i32 addrspace(2)* %ptr to i32* + %ld = load volatile i32, i32* %stof ret void } @@ -117,8 +118,8 @@ ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]] ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}} ; HSA: ds_write_b32 [[CASTPTR]], v[[K]] -define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 { - %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)* +define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 { + %ftos = addrspacecast i32* %ptr to i32 addrspace(3)* store volatile i32 0, i32 addrspace(3)* %ftos ret void } @@ -134,9 +135,9 @@ ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]] ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}} ; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} -define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 { - %ftos = addrspacecast i32 addrspace(4)* %ptr to i32* - store volatile i32 0, i32* %ftos +define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 { + %ftos = addrspacecast i32* %ptr to i32 addrspace(5)* + store volatile i32 0, i32 addrspace(5)* %ftos ret void } @@ -148,8 +149,8 @@ ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0 ; HSA: {{flat|global}}_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]] -define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 { - %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)* +define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #0 { + %ftos = 
addrspacecast i32* %ptr to i32 addrspace(1)* store volatile i32 0, i32 addrspace(1)* %ftos ret void } @@ -159,8 +160,8 @@ ; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0 ; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0 -define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 { - %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)* +define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #0 { + %ftos = addrspacecast i32* %ptr to i32 addrspace(2)* load volatile i32, i32 addrspace(2)* %ftos ret void } @@ -178,8 +179,8 @@ ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 { - %cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)* - store volatile i32 7, i32 addrspace(4)* %cast + %cast = addrspacecast i32 addrspace(3)* null to i32* + store volatile i32 7, i32* %cast ret void } @@ -188,7 +189,7 @@ ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}} ; HSA: ds_write_b32 [[PTR]], [[K]] define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 { - %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)* + %cast = addrspacecast i32* null to i32 addrspace(3)* store volatile i32 7, i32 addrspace(3)* %cast ret void } @@ -199,8 +200,8 @@ ; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 { - %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)* - store volatile i32 7, i32 addrspace(4)* %cast + %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32* + store volatile i32 7, i32* %cast ret void } @@ -209,7 +210,7 @@ ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}} ; HSA: ds_write_b32 [[PTR]], [[K]] define 
amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 { - %cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)* + %cast = addrspacecast i32* inttoptr (i64 -1 to i32*) to i32 addrspace(3)* store volatile i32 7, i32 addrspace(3)* %cast ret void } @@ -224,8 +225,8 @@ ; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 { - %cast = addrspacecast i32* null to i32 addrspace(4)* - store volatile i32 7, i32 addrspace(4)* %cast + %cast = addrspacecast i32 addrspace(5)* null to i32* + store volatile i32 7, i32* %cast ret void } @@ -233,8 +234,8 @@ ; HSA: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}} ; HSA: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 { - %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)* - store volatile i32 7, i32* %cast + %cast = addrspacecast i32* null to i32 addrspace(5)* + store volatile i32 7, i32 addrspace(5)* %cast ret void } @@ -250,17 +251,17 @@ br i1 %cmp, label %local, label %global local: - %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)* + %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32* br label %end global: - %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* + %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32* br label %end end: - %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ] - store volatile i32 %x, i32 addrspace(4)* %fptr, align 4 -; %val = load i32, i32 addrspace(4)* %fptr, align 4 + %fptr = phi i32* [ %flat_local, %local ], [ %flat_global, %global ] + store volatile i32 %x, i32* %fptr, align 4 +; %val = load i32, i32* %fptr, align 4 ; store i32 %val, i32 addrspace(1)* %out, align 4 ret void } @@ -278,14 +279,14 @@ ; HSA: s_barrier ; HSA: 
{{flat|global}}_load_dword define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { - %alloca = alloca i32, i32 9, align 4 + %alloca = alloca i32, i32 9, align 4, addrspace(5) %x = call i32 @llvm.amdgcn.workitem.id.x() #2 - %pptr = getelementptr i32, i32* %alloca, i32 %x - %fptr = addrspacecast i32* %pptr to i32 addrspace(4)* - store volatile i32 %x, i32 addrspace(4)* %fptr + %pptr = getelementptr i32, i32 addrspace(5)* %alloca, i32 %x + %fptr = addrspacecast i32 addrspace(5)* %pptr to i32* + store volatile i32 %x, i32* %fptr ; Dummy call call void @llvm.amdgcn.s.barrier() #1 - %reload = load volatile i32, i32 addrspace(4)* %fptr, align 4 + %reload = load volatile i32, i32* %fptr, align 4 store volatile i32 %reload, i32 addrspace(1)* %out, align 4 ret void } Index: test/CodeGen/AMDGPU/amdgcn.private-memory.ll =================================================================== --- test/CodeGen/AMDGPU/amdgcn.private-memory.ll +++ test/CodeGen/AMDGPU/amdgcn.private-memory.ll @@ -4,6 +4,7 @@ ; RUN: llc -mattr=-promote-alloca,-flat-for-global -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-ALLOCA -check-prefix=HSA %s ; RUN: llc -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE %s ; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-ALLOCA %s +target datalayout = "A5" declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -17,13 +18,13 @@ ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) { entry: - %0 = alloca [2 x i32] - %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0 - %2 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 1 - store i32 0, i32* %1 - store i32 1, i32* %2 - %3 = getelementptr 
[2 x i32], [2 x i32]* %0, i32 0, i32 %in - %4 = load i32, i32* %3 + %0 = alloca [2 x i32], addrspace(5) + %1 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 0 + %2 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 1 + store i32 0, i32 addrspace(5)* %1 + store i32 1, i32 addrspace(5)* %2 + %3 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 %in + %4 = load i32, i32 addrspace(5)* %3 %5 = call i32 @llvm.amdgcn.workitem.id.x() %6 = add i32 %4, %5 store i32 %6, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll =================================================================== --- test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll +++ test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll @@ -1,9 +1,9 @@ ; RUN: opt -mtriple=amdgcn-- -O3 -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s ; RUN: opt -mtriple=r600-- -O3 -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s -; CHECK: NoAlias: i8 addrspace(1)* %p1, i8* %p +; CHECK: NoAlias: i8 addrspace(1)* %p1, i8 addrspace(5)* %p -define void @test(i8* %p, i8 addrspace(1)* %p1) { +define void @test(i8 addrspace(5)* %p, i8 addrspace(1)* %p1) { ret void } Index: test/CodeGen/AMDGPU/amdgpu-inline.ll =================================================================== --- test/CodeGen/AMDGPU/amdgpu-inline.ll +++ test/CodeGen/AMDGPU/amdgpu-inline.ll @@ -1,5 +1,6 @@ ; RUN: opt -mtriple=amdgcn--amdhsa -O3 -S -amdgpu-function-calls -inline-threshold=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INL1 %s ; RUN: opt -mtriple=amdgcn--amdhsa -O3 -S -amdgpu-function-calls < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INLDEF %s +target datalayout = "A5" define coldcc float @foo(float %x, float %y) { entry: @@ -10,30 +11,30 @@ ret float %cond } -define coldcc void @foo_private_ptr(float* nocapture %p) { +define coldcc void @foo_private_ptr(float addrspace(5)* nocapture %p) { entry: - %tmp1 = load float, float* 
%p, align 4 + %tmp1 = load float, float addrspace(5)* %p, align 4 %cmp = fcmp ogt float %tmp1, 1.000000e+00 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry %div = fdiv float 1.000000e+00, %tmp1 - store float %div, float* %p, align 4 + store float %div, float addrspace(5)* %p, align 4 br label %if.end if.end: ; preds = %if.then, %entry ret void } -define coldcc void @foo_private_ptr2(float* nocapture %p1, float* nocapture %p2) { +define coldcc void @foo_private_ptr2(float addrspace(5)* nocapture %p1, float addrspace(5)* nocapture %p2) { entry: - %tmp1 = load float, float* %p1, align 4 + %tmp1 = load float, float addrspace(5)* %p1, align 4 %cmp = fcmp ogt float %tmp1, 1.000000e+00 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry %div = fdiv float 2.000000e+00, %tmp1 - store float %div, float* %p2, align 4 + store float %div, float addrspace(5)* %p2, align 4 br label %if.end if.end: ; preds = %if.then, %entry @@ -46,11 +47,11 @@ ret float %call } -define void @foo_noinline(float* nocapture %p) #0 { +define void @foo_noinline(float addrspace(5)* nocapture %p) #0 { entry: - %tmp1 = load float, float* %p, align 4 + %tmp1 = load float, float addrspace(5)* %p, align 4 %mul = fmul float %tmp1, 2.000000e+00 - store float %mul, float* %p, align 4 + store float %mul, float addrspace(5)* %p, align 4 ret void } @@ -63,7 +64,7 @@ ; GCN: tail call float @_Z3sinf( define amdgpu_kernel void @test_inliner(float addrspace(1)* nocapture %a, i32 %n) { entry: - %pvt_arr = alloca [64 x float], align 4 + %pvt_arr = alloca [64 x float], align 4, addrspace(5) %tid = tail call i32 @llvm.amdgcn.workitem.id.x() %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i32 %tid %tmp2 = load float, float addrspace(1)* %arrayidx, align 4 @@ -72,22 +73,22 @@ %tmp5 = load float, float addrspace(1)* %arrayidx2, align 4 %c1 = tail call coldcc float @foo(float %tmp2, float %tmp5) %or = or i32 %tid, %n - %arrayidx5 = getelementptr inbounds [64 x float], [64 
x float]* %pvt_arr, i32 0, i32 %or - store float %c1, float* %arrayidx5, align 4 - %arrayidx7 = getelementptr inbounds [64 x float], [64 x float]* %pvt_arr, i32 0, i32 %or - call coldcc void @foo_private_ptr(float* %arrayidx7) - %arrayidx8 = getelementptr inbounds [64 x float], [64 x float]* %pvt_arr, i32 0, i32 1 - %arrayidx9 = getelementptr inbounds [64 x float], [64 x float]* %pvt_arr, i32 0, i32 2 - call coldcc void @foo_private_ptr2(float* %arrayidx8, float* %arrayidx9) - call void @foo_noinline(float* %arrayidx7) + %arrayidx5 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 %or + store float %c1, float addrspace(5)* %arrayidx5, align 4 + %arrayidx7 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 %or + call coldcc void @foo_private_ptr(float addrspace(5)* %arrayidx7) + %arrayidx8 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 1 + %arrayidx9 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 2 + call coldcc void @foo_private_ptr2(float addrspace(5)* %arrayidx8, float addrspace(5)* %arrayidx9) + call void @foo_noinline(float addrspace(5)* %arrayidx7) %and = and i32 %tid, %n - %arrayidx11 = getelementptr inbounds [64 x float], [64 x float]* %pvt_arr, i32 0, i32 %and - %tmp12 = load float, float* %arrayidx11, align 4 + %arrayidx11 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 %and + %tmp12 = load float, float addrspace(5)* %arrayidx11, align 4 %c2 = call coldcc float @sin_wrapper(float %tmp12) - store float %c2, float* %arrayidx7, align 4 + store float %c2, float addrspace(5)* %arrayidx7, align 4 %xor = xor i32 %tid, %n - %arrayidx16 = getelementptr inbounds [64 x float], [64 x float]* %pvt_arr, i32 0, i32 %xor - %tmp16 = load float, float* %arrayidx16, align 4 + %arrayidx16 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 %xor + %tmp16 
= load float, float addrspace(5)* %arrayidx16, align 4 store float %tmp16, float addrspace(1)* %arrayidx, align 4 ret void } @@ -96,23 +97,23 @@ ; GCN: %div.i{{[0-9]*}} = fdiv float 2.000000e+00, %tmp1.i define amdgpu_kernel void @test_inliner_multi_pvt_ptr(float addrspace(1)* nocapture %a, i32 %n, float %v) { entry: - %pvt_arr1 = alloca [32 x float], align 4 - %pvt_arr2 = alloca [32 x float], align 4 + %pvt_arr1 = alloca [32 x float], align 4, addrspace(5) + %pvt_arr2 = alloca [32 x float], align 4, addrspace(5) %tid = tail call i32 @llvm.amdgcn.workitem.id.x() %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i32 %tid %or = or i32 %tid, %n - %arrayidx4 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr1, i32 0, i32 %or - %arrayidx5 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr2, i32 0, i32 %or - store float %v, float* %arrayidx4, align 4 - store float %v, float* %arrayidx5, align 4 - %arrayidx8 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr1, i32 0, i32 1 - %arrayidx9 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr2, i32 0, i32 2 - call coldcc void @foo_private_ptr2(float* %arrayidx8, float* %arrayidx9) + %arrayidx4 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 %or + %arrayidx5 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr2, i32 0, i32 %or + store float %v, float addrspace(5)* %arrayidx4, align 4 + store float %v, float addrspace(5)* %arrayidx5, align 4 + %arrayidx8 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 1 + %arrayidx9 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr2, i32 0, i32 2 + call coldcc void @foo_private_ptr2(float addrspace(5)* %arrayidx8, float addrspace(5)* %arrayidx9) %xor = xor i32 %tid, %n - %arrayidx15 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr1, i32 0, i32 %xor - %arrayidx16 = getelementptr inbounds [32 x float], 
[32 x float]* %pvt_arr2, i32 0, i32 %xor - %tmp15 = load float, float* %arrayidx15, align 4 - %tmp16 = load float, float* %arrayidx16, align 4 + %arrayidx15 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 %xor + %arrayidx16 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr2, i32 0, i32 %xor + %tmp15 = load float, float addrspace(5)* %arrayidx15, align 4 + %tmp16 = load float, float addrspace(5)* %arrayidx16, align 4 %tmp17 = fadd float %tmp15, %tmp16 store float %tmp17, float addrspace(1)* %arrayidx, align 4 ret void @@ -123,23 +124,23 @@ ; GCN-INLDEF: %div.i{{[0-9]*}} = fdiv float 2.000000e+00, %tmp1.i define amdgpu_kernel void @test_inliner_multi_pvt_ptr_cutoff(float addrspace(1)* nocapture %a, i32 %n, float %v) { entry: - %pvt_arr1 = alloca [32 x float], align 4 - %pvt_arr2 = alloca [33 x float], align 4 + %pvt_arr1 = alloca [32 x float], align 4, addrspace(5) + %pvt_arr2 = alloca [33 x float], align 4, addrspace(5) %tid = tail call i32 @llvm.amdgcn.workitem.id.x() %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i32 %tid %or = or i32 %tid, %n - %arrayidx4 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr1, i32 0, i32 %or - %arrayidx5 = getelementptr inbounds [33 x float], [33 x float]* %pvt_arr2, i32 0, i32 %or - store float %v, float* %arrayidx4, align 4 - store float %v, float* %arrayidx5, align 4 - %arrayidx8 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr1, i32 0, i32 1 - %arrayidx9 = getelementptr inbounds [33 x float], [33 x float]* %pvt_arr2, i32 0, i32 2 - call coldcc void @foo_private_ptr2(float* %arrayidx8, float* %arrayidx9) + %arrayidx4 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 %or + %arrayidx5 = getelementptr inbounds [33 x float], [33 x float] addrspace(5)* %pvt_arr2, i32 0, i32 %or + store float %v, float addrspace(5)* %arrayidx4, align 4 + store float %v, float addrspace(5)* %arrayidx5, align 4 
+ %arrayidx8 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 1 + %arrayidx9 = getelementptr inbounds [33 x float], [33 x float] addrspace(5)* %pvt_arr2, i32 0, i32 2 + call coldcc void @foo_private_ptr2(float addrspace(5)* %arrayidx8, float addrspace(5)* %arrayidx9) %xor = xor i32 %tid, %n - %arrayidx15 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr1, i32 0, i32 %xor - %arrayidx16 = getelementptr inbounds [33 x float], [33 x float]* %pvt_arr2, i32 0, i32 %xor - %tmp15 = load float, float* %arrayidx15, align 4 - %tmp16 = load float, float* %arrayidx16, align 4 + %arrayidx15 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 %xor + %arrayidx16 = getelementptr inbounds [33 x float], [33 x float] addrspace(5)* %pvt_arr2, i32 0, i32 %xor + %tmp15 = load float, float addrspace(5)* %arrayidx15, align 4 + %tmp16 = load float, float addrspace(5)* %arrayidx16, align 4 %tmp17 = fadd float %tmp15, %tmp16 store float %tmp17, float addrspace(1)* %arrayidx, align 4 ret void Index: test/CodeGen/AMDGPU/amdgpu.private-memory.ll =================================================================== --- test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -9,6 +9,7 @@ ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -enable-var-scope -check-prefix=NOHSAOPT -check-prefix=OPT %s ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC +target datalayout = "A5" ; HSAOPT: @mova_same_clause.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4 @@ -80,19 +81,19 @@ ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !1 define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [5 x i32], align 4, addrspace(5) %0 = load i32, 
i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 - store i32 4, i32* %arrayidx1, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 + store i32 4, i32 addrspace(5)* %arrayidx1, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 - %2 = load i32, i32* %arrayidx10, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 + %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 - %3 = load i32, i32* %arrayidx12 + %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 + %3 = load i32, i32 addrspace(5)* %arrayidx12 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 store i32 %3, i32 addrspace(1)* %arrayidx13 ret void @@ -102,19 +103,19 @@ ; OPT: getelementptr inbounds [256 x [8 x i32]], [256 x [8 x i32]] addrspace(3)* @high_alignment.stack, i32 0, i32 %{{[0-9]+}} define amdgpu_kernel void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: - %stack = alloca [8 x i32], align 16 + %stack = alloca [8 x i32], align 16, addrspace(5) %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [8 x i32], [8 x i32]* %stack, i32 0, i32 %0 - store i32 4, i32* %arrayidx1, align 4 + %arrayidx1 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* 
%stack, i32 0, i32 %0 + store i32 4, i32 addrspace(5)* %arrayidx1, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [8 x i32], [8 x i32]* %stack, i32 0, i32 %1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [8 x i32], [8 x i32]* %stack, i32 0, i32 0 - %2 = load i32, i32* %arrayidx10, align 4 + %arrayidx3 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %stack, i32 0, i32 %1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %stack, i32 0, i32 0 + %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [8 x i32], [8 x i32]* %stack, i32 0, i32 1 - %3 = load i32, i32* %arrayidx12 + %arrayidx12 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %stack, i32 0, i32 1 + %3 = load i32, i32 addrspace(5)* %arrayidx12 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 store i32 %3, i32 addrspace(1)* %arrayidx13 ret void @@ -127,19 +128,19 @@ ; SI-NOT: ds_write define amdgpu_kernel void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [5 x i32], align 4, addrspace(5) %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 - store i32 4, i32* %arrayidx1, align 4 + %arrayidx1 = getelementptr [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 + store i32 4, i32 addrspace(5)* %arrayidx1, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = 
getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 - %2 = load i32, i32* %arrayidx10, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 + %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 - %3 = load i32, i32* %arrayidx12 + %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 + %3 = load i32, i32 addrspace(5)* %arrayidx12 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 store i32 %3, i32 addrspace(1)* %arrayidx13 ret void @@ -162,20 +163,20 @@ define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 { entry: - %a = alloca %struct.point - %b = alloca %struct.point - %a.x.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0 - %a.y.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 1 - %b.x.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0 - %b.y.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 1 - store i32 0, i32* %a.x.ptr - store i32 1, i32* %a.y.ptr - store i32 2, i32* %b.x.ptr - store i32 3, i32* %b.y.ptr - %a.indirect.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0 - %b.indirect.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0 - %a.indirect = load i32, i32* %a.indirect.ptr - %b.indirect = load i32, i32* %b.indirect.ptr + %a = alloca %struct.point, addrspace(5) + %b = alloca %struct.point, addrspace(5) + %a.x.ptr = getelementptr %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 0 + %a.y.ptr = getelementptr %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 1 + %b.x.ptr = getelementptr %struct.point, %struct.point addrspace(5)* %b, i32 
0, i32 0 + %b.y.ptr = getelementptr %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 1 + store i32 0, i32 addrspace(5)* %a.x.ptr + store i32 1, i32 addrspace(5)* %a.y.ptr + store i32 2, i32 addrspace(5)* %b.x.ptr + store i32 3, i32 addrspace(5)* %b.y.ptr + %a.indirect.ptr = getelementptr %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 0 + %b.indirect.ptr = getelementptr %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 0 + %a.indirect = load i32, i32 addrspace(5)* %a.indirect.ptr + %b.indirect = load i32, i32 addrspace(5)* %b.indirect.ptr %0 = add i32 %a.indirect, %b.indirect store i32 %0, i32 addrspace(1)* %out ret void @@ -191,32 +192,32 @@ define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { entry: - %prv_array_const = alloca [2 x i32] - %prv_array = alloca [2 x i32] + %prv_array_const = alloca [2 x i32], addrspace(5) + %prv_array = alloca [2 x i32], addrspace(5) %a = load i32, i32 addrspace(1)* %in %b_src_ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %b = load i32, i32 addrspace(1)* %b_src_ptr - %a_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0 - store i32 %a, i32* %a_dst_ptr - %b_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1 - store i32 %b, i32* %b_dst_ptr + %a_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 0 + store i32 %a, i32 addrspace(5)* %a_dst_ptr + %b_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 1 + store i32 %b, i32 addrspace(5)* %b_dst_ptr br label %for.body for.body: %inc = phi i32 [0, %entry], [%count, %for.body] - %x_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0 - %x = load i32, i32* %x_ptr - %y_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0 - %y = load i32, i32* %y_ptr + %x_ptr = getelementptr inbounds [2 x i32], [2 x i32] 
addrspace(5)* %prv_array_const, i32 0, i32 0 + %x = load i32, i32 addrspace(5)* %x_ptr + %y_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array, i32 0, i32 0 + %y = load i32, i32 addrspace(5)* %y_ptr %xy = add i32 %x, %y - store i32 %xy, i32* %y_ptr + store i32 %xy, i32 addrspace(5)* %y_ptr %count = add i32 %inc, 1 %done = icmp eq i32 %count, 4095 br i1 %done, label %for.end, label %for.body for.end: - %value_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0 - %value = load i32, i32* %value_ptr + %value_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array, i32 0, i32 0 + %value = load i32, i32 addrspace(5)* %value_ptr store i32 %value, i32 addrspace(1)* %out ret void } @@ -235,13 +236,13 @@ ; SI-PROMOTE: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[SCALED_IDX]], 16 define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: - %0 = alloca [2 x i16] - %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0 - %2 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 1 - store i16 0, i16* %1 - store i16 1, i16* %2 - %3 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 %index - %4 = load i16, i16* %3 + %0 = alloca [2 x i16], addrspace(5) + %1 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 0 + %2 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 1 + store i16 0, i16 addrspace(5)* %1 + store i16 1, i16 addrspace(5)* %2 + %3 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 %index + %4 = load i16, i16 addrspace(5)* %3 %5 = sext i16 %4 to i32 store i32 %5, i32 addrspace(1)* %out ret void @@ -257,13 +258,13 @@ ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0 define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: - %0 = alloca [2 x i8] - %1 = getelementptr inbounds [2 x 
i8], [2 x i8]* %0, i32 0, i32 0 - %2 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 1 - store i8 0, i8* %1 - store i8 1, i8* %2 - %3 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 %index - %4 = load i8, i8* %3 + %0 = alloca [2 x i8], addrspace(5) + %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 0 + %2 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 1 + store i8 0, i8 addrspace(5)* %1 + store i8 1, i8 addrspace(5)* %2 + %3 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 %index + %4 = load i8, i8 addrspace(5)* %3 %5 = sext i8 %4 to i32 store i32 %5, i32 addrspace(1)* %out ret void @@ -280,22 +281,22 @@ ; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 { entry: - %0 = alloca [3 x i8], align 1 - %1 = alloca [2 x i8], align 1 - %2 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 0 - %3 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 1 - %4 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 2 - %5 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 0 - %6 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 1 - store i8 0, i8* %2 - store i8 1, i8* %3 - store i8 2, i8* %4 - store i8 1, i8* %5 - store i8 0, i8* %6 - %7 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 %in - %8 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 %in - %9 = load i8, i8* %7 - %10 = load i8, i8* %8 + %0 = alloca [3 x i8], align 1, addrspace(5) + %1 = alloca [2 x i8], align 1, addrspace(5) + %2 = getelementptr [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 0 + %3 = getelementptr [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 1 + %4 = getelementptr [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 2 + %5 = getelementptr [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 0 + %6 = getelementptr [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 1 + store i8 0, i8 addrspace(5)* %2 + store 
i8 1, i8 addrspace(5)* %3 + store i8 2, i8 addrspace(5)* %4 + store i8 1, i8 addrspace(5)* %5 + store i8 0, i8 addrspace(5)* %6 + %7 = getelementptr [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 %in + %8 = getelementptr [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 %in + %9 = load i8, i8 addrspace(5)* %7 + %10 = load i8, i8 addrspace(5)* %8 %11 = add i8 %9, %10 %12 = sext i8 %11 to i32 store i32 %12, i32 addrspace(1)* %out @@ -304,13 +305,13 @@ define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: - %alloca = alloca [2 x [2 x i8]] - %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1 - store i8 0, i8* %gep0 - store i8 1, i8* %gep1 - %gep2 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index - %load = load i8, i8* %gep2 + %alloca = alloca [2 x [2 x i8]], addrspace(5) + %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 0 + %gep1 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 1 + store i8 0, i8 addrspace(5)* %gep0 + store i8 1, i8 addrspace(5)* %gep1 + %gep2 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index + %load = load i8, i8 addrspace(5)* %gep2 %sext = sext i8 %load to i32 store i32 %sext, i32 addrspace(1)* %out ret void @@ -318,26 +319,26 @@ define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: - %alloca = alloca [2 x [2 x i32]] - %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1 - store i32 0, i32* %gep0 - store i32 1, i32* %gep1 - %gep2 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index - %load = load i32, i32* %gep2 + %alloca = alloca [2 x [2 x i32]], 
addrspace(5) + %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0 + %gep1 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1 + store i32 0, i32 addrspace(5)* %gep0 + store i32 1, i32 addrspace(5)* %gep1 + %gep2 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index + %load = load i32, i32 addrspace(5)* %gep2 store i32 %load, i32 addrspace(1)* %out ret void } define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 { entry: - %alloca = alloca [2 x [2 x i64]] - %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1 - store i64 0, i64* %gep0 - store i64 1, i64* %gep1 - %gep2 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index - %load = load i64, i64* %gep2 + %alloca = alloca [2 x [2 x i64]], addrspace(5) + %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 0 + %gep1 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 1 + store i64 0, i64 addrspace(5)* %gep0 + store i64 1, i64 addrspace(5)* %gep1 + %gep2 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index + %load = load i64, i64 addrspace(5)* %gep2 store i64 %load, i64 addrspace(1)* %out ret void } @@ -346,40 +347,40 @@ define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: - %alloca = alloca [2 x [2 x %struct.pair32]] - %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1 - %gep1 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1 - store i32 0, i32* %gep0 - store i32 1, i32* %gep1 - %gep2 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 
x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0 - %load = load i32, i32* %gep2 + %alloca = alloca [2 x [2 x %struct.pair32]], addrspace(5) + %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0, i32 1 + %gep1 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1, i32 1 + store i32 0, i32 addrspace(5)* %gep0 + store i32 1, i32 addrspace(5)* %gep1 + %gep2 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index, i32 0 + %load = load i32, i32 addrspace(5)* %gep2 store i32 %load, i32 addrspace(1)* %out ret void } define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: - %alloca = alloca [2 x %struct.pair32] - %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1 - %gep1 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0 - store i32 0, i32* %gep0 - store i32 1, i32* %gep1 - %gep2 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0 - %load = load i32, i32* %gep2 + %alloca = alloca [2 x %struct.pair32], addrspace(5) + %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 0, i32 1 + %gep1 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 1, i32 0 + store i32 0, i32 addrspace(5)* %gep0 + store i32 1, i32 addrspace(5)* %gep1 + %gep2 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 %index, i32 0 + %load = load i32, i32 addrspace(5)* %gep2 store i32 %load, i32 addrspace(1)* %out ret void } define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { entry: - %tmp = alloca [2 x i32] - %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 - %tmp2 = 
getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 - store i32 0, i32* %tmp1 - store i32 1, i32* %tmp2 + %tmp = alloca [2 x i32], addrspace(5) + %tmp1 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1 + store i32 0, i32 addrspace(5)* %tmp1 + store i32 1, i32 addrspace(5)* %tmp2 %cmp = icmp eq i32 %in, 0 - %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2 - %load = load i32, i32* %sel + %sel = select i1 %cmp, i32 addrspace(5)* %tmp1, i32 addrspace(5)* %tmp2 + %load = load i32, i32 addrspace(5)* %sel store i32 %load, i32 addrspace(1)* %out ret void } @@ -393,14 +394,14 @@ ; SI: v_add_{{[iu]}}32_e32 [[ADD_OFFSET:v[0-9]+]], vcc, 5, ; SI: buffer_load_dword v{{[0-9]+}}, [[ADD_OFFSET:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { - %alloca = alloca [16 x i32] - %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a - store i32 5, i32* %tmp0 - %tmp1 = ptrtoint [16 x i32]* %alloca to i32 + %alloca = alloca [16 x i32], addrspace(5) + %tmp0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a + store i32 5, i32 addrspace(5)* %tmp0 + %tmp1 = ptrtoint [16 x i32] addrspace(5)* %alloca to i32 %tmp2 = add i32 %tmp1, 5 - %tmp3 = inttoptr i32 %tmp2 to i32* - %tmp4 = getelementptr i32, i32* %tmp3, i32 %b - %tmp5 = load i32, i32* %tmp4 + %tmp3 = inttoptr i32 %tmp2 to i32 addrspace(5)* + %tmp4 = getelementptr i32, i32 addrspace(5)* %tmp3, i32 %b + %tmp5 = load i32, i32 addrspace(5)* %tmp4 store i32 %tmp5, i32 addrspace(1)* %out ret void } @@ -410,15 +411,15 @@ ; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4 define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) { entry: - %A.addr = alloca i32 addrspace(1)*, align 4 - store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 - %ld0 = load i32 addrspace(1)*, i32 
addrspace(1)** %A.addr, align 4 + %A.addr = alloca i32 addrspace(1)*, align 4, addrspace(5) + store i32 addrspace(1)* %A, i32 addrspace(1)* addrspace(5)* %A.addr, align 4 + %ld0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %A.addr, align 4 %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %ld0, i32 0 store i32 1, i32 addrspace(1)* %arrayidx, align 4 - %ld1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4 + %ld1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %A.addr, align 4 %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %ld1, i32 1 store i32 2, i32 addrspace(1)* %arrayidx1, align 4 - %ld2 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4 + %ld2 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %A.addr, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %ld2, i32 2 store i32 3, i32 addrspace(1)* %arrayidx2, align 4 ret void @@ -461,9 +462,9 @@ ; SI: buffer_load_dword define amdgpu_kernel void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) { - %alloca = alloca [2 x <16 x i32>] - %tmp0 = getelementptr [2 x <16 x i32>], [2 x <16 x i32>]* %alloca, i32 0, i32 %a - %tmp5 = load <16 x i32>, <16 x i32>* %tmp0 + %alloca = alloca [2 x <16 x i32>], addrspace(5) + %tmp0 = getelementptr [2 x <16 x i32>], [2 x <16 x i32>] addrspace(5)* %alloca, i32 0, i32 %a + %tmp5 = load <16 x i32>, <16 x i32> addrspace(5)* %tmp0 store <16 x i32> %tmp5, <16 x i32> addrspace(1)* %out ret void } @@ -505,9 +506,9 @@ ; SI: buffer_load_dword define amdgpu_kernel void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) { - %alloca = alloca [2 x <16 x float>] - %tmp0 = getelementptr [2 x <16 x float>], [2 x <16 x float>]* %alloca, i32 0, i32 %a - %tmp5 = load <16 x float>, <16 x float>* %tmp0 + %alloca = alloca [2 x <16 x float>], addrspace(5) + %tmp0 = getelementptr [2 x <16 x float>], [2 x <16 x float>] addrspace(5)* %alloca, i32 0, i32 %a + %tmp5 = load <16 x float>, <16 x float> addrspace(5)* 
%tmp0 store <16 x float> %tmp5, <16 x float> addrspace(1)* %out ret void } @@ -521,9 +522,9 @@ ; SI: buffer_load_dword define amdgpu_kernel void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) { - %alloca = alloca [16 x <2 x float>] - %tmp0 = getelementptr [16 x <2 x float>], [16 x <2 x float>]* %alloca, i32 0, i32 %a - %tmp5 = load <2 x float>, <2 x float>* %tmp0 + %alloca = alloca [16 x <2 x float>], addrspace(5) + %tmp0 = getelementptr [16 x <2 x float>], [16 x <2 x float>] addrspace(5)* %alloca, i32 0, i32 %a + %tmp5 = load <2 x float>, <2 x float> addrspace(5)* %tmp0 store <2 x float> %tmp5, <2 x float> addrspace(1)* %out ret void } @@ -533,9 +534,9 @@ ; OPT: load [0 x i32], [0 x i32] addrspace(3)* define amdgpu_kernel void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) { entry: - %tmp = alloca [0 x i32] - store [0 x i32] [], [0 x i32]* %tmp - %load = load [0 x i32], [0 x i32]* %tmp + %tmp = alloca [0 x i32], addrspace(5) + store [0 x i32] [], [0 x i32] addrspace(5)* %tmp + %load = load [0 x i32], [0 x i32] addrspace(5)* %tmp store [0 x i32] %load, [0 x i32] addrspace(1)* %out ret void } @@ -545,9 +546,9 @@ ; OPT: load [1 x i32], [1 x i32] addrspace(3)* define amdgpu_kernel void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) { entry: - %tmp = alloca [1 x i32] - store [1 x i32] [i32 0], [1 x i32]* %tmp - %load = load [1 x i32], [1 x i32]* %tmp + %tmp = alloca [1 x i32], addrspace(5) + store [1 x i32] [i32 0], [1 x i32] addrspace(5)* %tmp + %load = load [1 x i32], [1 x i32] addrspace(5)* %tmp store [1 x i32] %load, [1 x i32] addrspace(1)* %out ret void } Index: test/CodeGen/AMDGPU/amdpal.ll =================================================================== --- test/CodeGen/AMDGPU/amdpal.ll +++ test/CodeGen/AMDGPU/amdpal.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tahiti | FileCheck --check-prefix=PAL --enable-var-scope %s +target datalayout = "A5" ; PAL: .AMDGPU.config @@ -17,14 +18,14 @@ ; 
PAL: s_load_dwordx4 s{{\[}}[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s{{\[}}[[GITPTR]]: ; PAL: buffer_store{{.*}}, s{{\[}}[[SCRATCHDESC]]: -define amdgpu_kernel void @scratch(<2 x i32> %in, i32 %idx, i32* %out) { +define amdgpu_kernel void @scratch(<2 x i32> %in, i32 %idx, i32 addrspace(5)* %out) { entry: - %v = alloca [2 x i32] - %vv = bitcast [2 x i32]* %v to <2 x i32>* - store <2 x i32> %in, <2 x i32>* %vv - %e = getelementptr [2 x i32], [2 x i32]* %v, i32 0, i32 %idx - %x = load i32, i32* %e - store i32 %x, i32* %out + %v = alloca [2 x i32], addrspace(5) + %vv = bitcast [2 x i32] addrspace(5)* %v to <2 x i32> addrspace(5)* + store <2 x i32> %in, <2 x i32> addrspace(5)* %vv + %e = getelementptr [2 x i32], [2 x i32] addrspace(5)* %v, i32 0, i32 %idx + %x = load i32, i32 addrspace(5)* %e + store i32 %x, i32 addrspace(5)* %out ret void } @@ -41,14 +42,14 @@ ; PAL: s_load_dwordx4 s{{\[}}[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s{{\[}}[[GITPTR]]: ; PAL: buffer_store{{.*}}, s{{\[}}[[SCRATCHDESC]]: -define amdgpu_kernel void @scratch2(<2 x i32> %in, i32 %idx, i32* %out) #0 { +define amdgpu_kernel void @scratch2(<2 x i32> %in, i32 %idx, i32 addrspace(5)* %out) #0 { entry: - %v = alloca [2 x i32] - %vv = bitcast [2 x i32]* %v to <2 x i32>* - store <2 x i32> %in, <2 x i32>* %vv - %e = getelementptr [2 x i32], [2 x i32]* %v, i32 0, i32 %idx - %x = load i32, i32* %e - store i32 %x, i32* %out + %v = alloca [2 x i32], addrspace(5) + %vv = bitcast [2 x i32] addrspace(5)* %v to <2 x i32> addrspace(5)* + store <2 x i32> %in, <2 x i32> addrspace(5)* %vv + %e = getelementptr [2 x i32], [2 x i32] addrspace(5)* %v, i32 0, i32 %idx + %x = load i32, i32 addrspace(5)* %e + store i32 %x, i32 addrspace(5)* %out ret void } Index: test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll =================================================================== --- test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -176,57 +176,57 @@ ; HSA: define 
amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 { define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 { - %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* - store volatile i32 0, i32 addrspace(4)* %stof + %stof = addrspacecast i32 addrspace(3)* %ptr to i32* + store volatile i32 0, i32* %stof ret void } -; HSA: define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #11 { -define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #1 { - %stof = addrspacecast i32* %ptr to i32 addrspace(4)* - store volatile i32 0, i32 addrspace(4)* %stof +; HSA: define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32 addrspace(5)* %ptr) #11 { +define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32 addrspace(5)* %ptr) #1 { + %stof = addrspacecast i32 addrspace(5)* %ptr to i32* + store volatile i32 0, i32* %stof ret void } -; HSA: define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 { -define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 { - %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)* +; HSA: define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #1 { +define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #1 { + %ftos = addrspacecast i32* %ptr to i32 addrspace(3)* store volatile i32 0, i32 addrspace(3)* %ftos ret void } -; HSA: define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 { -define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 { - %ftos = addrspacecast i32 addrspace(4)* %ptr to i32* - store volatile i32 0, i32* %ftos +; HSA: define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #1 { +define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #1 { + %ftos = addrspacecast i32* %ptr to i32 addrspace(5)* + store volatile i32 0, i32 
addrspace(5)* %ftos ret void } ; No-op addrspacecast should not use queue ptr ; HSA: define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 { define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 { - %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)* - store volatile i32 0, i32 addrspace(4)* %stof + %stof = addrspacecast i32 addrspace(1)* %ptr to i32* + store volatile i32 0, i32* %stof ret void } ; HSA: define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 { define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 { - %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)* - %ld = load volatile i32, i32 addrspace(4)* %stof + %stof = addrspacecast i32 addrspace(2)* %ptr to i32* + %ld = load volatile i32, i32* %stof ret void } -; HSA: define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 { -define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 { - %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)* +; HSA: define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #1 { +define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #1 { + %ftos = addrspacecast i32* %ptr to i32 addrspace(1)* store volatile i32 0, i32 addrspace(1)* %ftos ret void } -; HSA: define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 { -define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 { - %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)* +; HSA: define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #1 { +define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #1 { + %ftos = addrspacecast i32* %ptr to i32 addrspace(2)* %ld = load volatile i32, i32 addrspace(2)* %ftos ret void } Index: 
test/CodeGen/AMDGPU/array-ptr-calc-i32.ll =================================================================== --- test/CodeGen/AMDGPU/array-ptr-calc-i32.ll +++ test/CodeGen/AMDGPU/array-ptr-calc-i32.ll @@ -1,5 +1,6 @@ ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tahiti -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s +target datalayout = "A5" declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1 @@ -20,12 +21,12 @@ ; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this ; alloca to a vector. It currently fails because it does not know how ; to interpret: -; getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 1, i32 %b +; getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 1, i32 %b ; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64 ; SI-PROMOTE: ds_write_b32 [[PTRREG]] define amdgpu_kernel void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 { - %alloca = alloca [16 x i32], align 16 + %alloca = alloca [16 x i32], align 16, addrspace(5) %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0); %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo) %a_ptr = getelementptr inbounds i32, i32 addrspace(1)* %inA, i32 %tid @@ -33,11 +34,11 @@ %a = load i32, i32 addrspace(1)* %a_ptr, !range !0 %b = load i32, i32 addrspace(1)* %b_ptr, !range !0 %result = add i32 %a, %b - %alloca_ptr = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 1, i32 %b - store i32 %result, i32* %alloca_ptr, align 4 + %alloca_ptr = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 1, i32 %b + store i32 %result, i32 addrspace(5)* %alloca_ptr, align 4 ; Dummy call call void @llvm.amdgcn.s.barrier() - %reload = load 
i32, i32* %alloca_ptr, align 4, !range !0 + %reload = load i32, i32 addrspace(5)* %alloca_ptr, align 4, !range !0 %out_ptr = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid store i32 %reload, i32 addrspace(1)* %out_ptr, align 4 ret void Index: test/CodeGen/AMDGPU/byval-frame-setup.ll =================================================================== --- test/CodeGen/AMDGPU/byval-frame-setup.ll +++ test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -1,5 +1,6 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s +target datalayout = "A5" %struct.ByValStruct = type { [4 x i32] } @@ -14,16 +15,16 @@ ; GCN-NOT: s32 ; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:20{{$}} ; GCN-NOT: s32 -define void @void_func_byval_struct(%struct.ByValStruct* byval noalias nocapture align 4 %arg0, %struct.ByValStruct* byval noalias nocapture align 4 %arg1) #1 { +define void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 { entry: - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0 - %tmp = load volatile i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 + %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 4 %add = add nsw i32 %tmp, 1 - store volatile i32 %add, i32* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0 - %tmp1 = load volatile i32, i32* %arrayidx2, align 4 + store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds 
%struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 + %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 4 %add3 = add nsw i32 %tmp1, 2 - store volatile i32 %add3, i32* %arrayidx2, align 4 + store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 4 store volatile i32 9, i32 addrspace(1)* null, align 4 ret void } @@ -54,17 +55,17 @@ ; GCN: buffer_load_dword v33, ; GCN: s_sub_u32 s32, s32, 0xb00{{$}} ; GCN: s_setpc_b64 -define void @void_func_byval_struct_non_leaf(%struct.ByValStruct* byval noalias nocapture align 4 %arg0, %struct.ByValStruct* byval noalias nocapture align 4 %arg1) #1 { +define void @void_func_byval_struct_non_leaf(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 { entry: - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0 - %tmp = load volatile i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 + %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 4 %add = add nsw i32 %tmp, 1 - store volatile i32 %add, i32* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0 - %tmp1 = load volatile i32, i32* %arrayidx2, align 4 + store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 + %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 4 %add3 = add nsw i32 %tmp1, 2 call void @external_void_func_void() - store volatile i32 %add3, i32* %arrayidx2, align 4 + store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 4 store volatile i32 9, i32 addrspace(1)* null, align 4 ret void } @@ -114,19 +115,19 @@ ; GCN-NEXT: s_setpc_b64 define void 
@call_void_func_byval_struct_func() #0 { entry: - %arg0 = alloca %struct.ByValStruct, align 4 - %arg1 = alloca %struct.ByValStruct, align 4 - %tmp = bitcast %struct.ByValStruct* %arg0 to i8* - call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp) - %tmp1 = bitcast %struct.ByValStruct* %arg1 to i8* - call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp1) - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0 - store volatile i32 9, i32* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0 - store volatile i32 13, i32* %arrayidx2, align 4 - call void @void_func_byval_struct(%struct.ByValStruct* byval nonnull align 4 %arg0, %struct.ByValStruct* byval nonnull align 4 %arg1) - call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp1) - call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp) + %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5) + %arg1 = alloca %struct.ByValStruct, align 4, addrspace(5) + %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)* + call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp) + %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)* + call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1) + %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 + store volatile i32 9, i32 addrspace(5)* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 + store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 4 + call void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg1) + call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1) + call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp) ret void } @@ 
-167,45 +168,45 @@ ; GCN: s_endpgm define amdgpu_kernel void @call_void_func_byval_struct_kernel() #0 { entry: - %arg0 = alloca %struct.ByValStruct, align 4 - %arg1 = alloca %struct.ByValStruct, align 4 - %tmp = bitcast %struct.ByValStruct* %arg0 to i8* - call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp) - %tmp1 = bitcast %struct.ByValStruct* %arg1 to i8* - call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp1) - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0 - store volatile i32 9, i32* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0 - store volatile i32 13, i32* %arrayidx2, align 4 - call void @void_func_byval_struct(%struct.ByValStruct* byval nonnull align 4 %arg0, %struct.ByValStruct* byval nonnull align 4 %arg1) - call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp1) - call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp) + %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5) + %arg1 = alloca %struct.ByValStruct, align 4, addrspace(5) + %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)* + call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp) + %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)* + call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1) + %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 + store volatile i32 9, i32 addrspace(5)* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 + store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 4 + call void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg1) + call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1) + call void 
@llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp) ret void } ; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel_no_frame_pointer_elim: define amdgpu_kernel void @call_void_func_byval_struct_kernel_no_frame_pointer_elim() #2 { entry: - %arg0 = alloca %struct.ByValStruct, align 4 - %arg1 = alloca %struct.ByValStruct, align 4 - %tmp = bitcast %struct.ByValStruct* %arg0 to i8* - call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp) - %tmp1 = bitcast %struct.ByValStruct* %arg1 to i8* - call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp1) - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0 - store volatile i32 9, i32* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0 - store volatile i32 13, i32* %arrayidx2, align 4 - call void @void_func_byval_struct(%struct.ByValStruct* byval nonnull align 4 %arg0, %struct.ByValStruct* byval nonnull align 4 %arg1) - call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp1) - call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp) + %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5) + %arg1 = alloca %struct.ByValStruct, align 4, addrspace(5) + %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)* + call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp) + %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)* + call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1) + %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 + store volatile i32 9, i32 addrspace(5)* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 + store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 4 + call void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg0, %struct.ByValStruct 
addrspace(5)* byval nonnull align 4 %arg1) + call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1) + call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp) ret void } declare void @external_void_func_void() #0 -declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #3 -declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #3 +declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #3 +declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #3 attributes #0 = { nounwind } attributes #1 = { noinline norecurse nounwind } Index: test/CodeGen/AMDGPU/call-argument-types.ll =================================================================== --- test/CodeGen/AMDGPU/call-argument-types.ll +++ test/CodeGen/AMDGPU/call-argument-types.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MESA %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,HSA %s +target datalayout = "A5" declare void @external_void_func_i1(i1) #0 declare void @external_void_func_i1_signext(i1 signext) #0 @@ -38,8 +39,8 @@ ; Structs declare void @external_void_func_struct_i8_i32({ i8, i32 }) #0 -declare void @external_void_func_byval_struct_i8_i32({ i8, i32 }* byval) #0 -declare void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 }* sret, { i8, i32 }* byval) #0 +declare void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval) #0 +declare void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* sret, { i8, i32 } addrspace(5)* byval) #0 
declare void @external_void_func_v16i8(<16 x i8>) #0 @@ -465,12 +466,12 @@ ; GCN-NEXT: s_swappc_b64 ; GCN-NOT: [[SP]] define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 { - %val = alloca { i8, i32 }, align 4 - %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %val, i32 0, i32 0 - %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %val, i32 0, i32 1 - store i8 3, i8* %gep0 - store i32 8, i32* %gep1 - call void @external_void_func_byval_struct_i8_i32({ i8, i32 }* %val) + %val = alloca { i8, i32 }, align 4, addrspace(5) + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 1 + store i8 3, i8 addrspace(5)* %gep0 + store i32 8, i32 addrspace(5)* %gep1 + call void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* %val) ret void } @@ -497,17 +498,17 @@ ; GCN: buffer_store_byte [[LOAD_OUT_VAL0]], off ; GCN: buffer_store_dword [[LOAD_OUT_VAL1]], off define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 { - %in.val = alloca { i8, i32 }, align 4 - %out.val = alloca { i8, i32 }, align 4 - %in.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %in.val, i32 0, i32 0 - %in.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %in.val, i32 0, i32 1 - store i8 3, i8* %in.gep0 - store i32 8, i32* %in.gep1 - call void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 }* %out.val, { i8, i32 }* %in.val) - %out.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %out.val, i32 0, i32 0 - %out.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %out.val, i32 0, i32 1 - %out.val0 = load i8, i8* %out.gep0 - %out.val1 = load i32, i32* %out.gep1 + %in.val = alloca { i8, i32 }, align 4, addrspace(5) + %out.val = alloca { i8, i32 }, align 4, addrspace(5) + %in.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* 
%in.val, i32 0, i32 0 + %in.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 1 + store i8 3, i8 addrspace(5)* %in.gep0 + store i32 8, i32 addrspace(5)* %in.gep1 + call void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* %out.val, { i8, i32 } addrspace(5)* %in.val) + %out.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 0 + %out.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 1 + %out.val0 = load i8, i8 addrspace(5)* %out.gep0 + %out.val1 = load i32, i32 addrspace(5)* %out.gep1 store volatile i8 %out.val0, i8 addrspace(1)* undef store volatile i32 %out.val1, i32 addrspace(1)* undef Index: test/CodeGen/AMDGPU/call-graph-register-usage.ll =================================================================== --- test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -1,6 +1,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s +target datalayout = "A5" ; Make sure to run a GPU with the SGPR allocation bug. 
@@ -132,24 +133,24 @@ ; GCN-LABEL: {{^}}use_stack0: ; GCN: ScratchSize: 2052 define void @use_stack0() #1 { - %alloca = alloca [512 x i32], align 4 - call void asm sideeffect "; use $0", "v"([512 x i32]* %alloca) #0 + %alloca = alloca [512 x i32], align 4, addrspace(5) + call void asm sideeffect "; use $0", "v"([512 x i32] addrspace(5)* %alloca) #0 ret void } ; GCN-LABEL: {{^}}use_stack1: ; GCN: ScratchSize: 404 define void @use_stack1() #1 { - %alloca = alloca [100 x i32], align 4 - call void asm sideeffect "; use $0", "v"([100 x i32]* %alloca) #0 + %alloca = alloca [100 x i32], align 4, addrspace(5) + call void asm sideeffect "; use $0", "v"([100 x i32] addrspace(5)* %alloca) #0 ret void } ; GCN-LABEL: {{^}}indirect_use_stack: ; GCN: ScratchSize: 2124 define void @indirect_use_stack() #1 { - %alloca = alloca [16 x i32], align 4 - call void asm sideeffect "; use $0", "v"([16 x i32]* %alloca) #0 + %alloca = alloca [16 x i32], align 4, addrspace(5) + call void asm sideeffect "; use $0", "v"([16 x i32] addrspace(5)* %alloca) #0 call void @use_stack0() ret void } @@ -201,8 +202,8 @@ ; GCN-LABEL: {{^}}direct_recursion_use_stack: ; GCN: ScratchSize: 2056 define void @direct_recursion_use_stack(i32 %val) #2 { - %alloca = alloca [512 x i32], align 4 - call void asm sideeffect "; use $0", "v"([512 x i32]* %alloca) #0 + %alloca = alloca [512 x i32], align 4, addrspace(5) + call void asm sideeffect "; use $0", "v"([512 x i32] addrspace(5)* %alloca) #0 %cmp = icmp eq i32 %val, 0 br i1 %cmp, label %ret, label %call Index: test/CodeGen/AMDGPU/callee-frame-setup.ll =================================================================== --- test/CodeGen/AMDGPU/callee-frame-setup.ll +++ test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN 
-check-prefix=GFX9 %s +target datalayout = "A5" ; GCN-LABEL: {{^}}callee_no_stack: ; GCN: ; %bb.0: @@ -28,8 +29,8 @@ ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack() #0 { - %alloca = alloca i32 - store volatile i32 0, i32* %alloca + %alloca = alloca i32, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca ret void } @@ -57,8 +58,8 @@ ; GCN: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_and_call() #0 { - %alloca = alloca i32 - store volatile i32 0, i32* %alloca + %alloca = alloca i32, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca call void @external_void_func_void() ret void } Index: test/CodeGen/AMDGPU/callee-special-input-sgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -1,5 +1,6 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +target datalayout = "A5" ; GCN-LABEL: {{^}}use_dispatch_ptr: ; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 @@ -43,8 +44,8 @@ ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]] ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]{{\]}} define void @use_queue_ptr_addrspacecast() #1 { - %asc = addrspacecast i32 addrspace(3)* inttoptr (i32 16 to i32 addrspace(3)*) to i32 addrspace(4)* - store volatile i32 0, i32 addrspace(4)* %asc + %asc = addrspacecast i32 addrspace(3)* inttoptr (i32 16 to i32 addrspace(3)*) to i32* + store volatile i32 0, i32* %asc ret void } @@ -113,8 +114,8 @@ ; GCN: ; use s6 ; GCN: s_setpc_b64 define void @use_stack_workgroup_id_x() #1 { - %alloca = alloca i32 - store volatile i32 0, i32* %alloca + %alloca = alloca i32, addrspace(5) + store volatile i32 
0, i32 addrspace(5)* %alloca %val = call i32 @llvm.amdgcn.workgroup.id.x() call void asm sideeffect "; use $0", "s"(i32 %val) ret void @@ -432,8 +433,8 @@ ; GCN: ; use s15 ; GCN: ; use s16 define void @use_every_sgpr_input() #1 { - %alloca = alloca i32, align 4 - store volatile i32 0, i32* %alloca + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* @@ -512,8 +513,8 @@ ; GCN-DAG: s_mov_b32 s8, s16 ; GCN: s_swappc_b64 define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 { - %alloca = alloca i32, align 4 - store volatile i32 0, i32* %alloca + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* @@ -568,10 +569,10 @@ ; GCN: ; use [[SAVE_Y]] ; GCN: ; use [[SAVE_Z]] define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill() #1 { - %alloca = alloca i32, align 4 + %alloca = alloca i32, align 4, addrspace(5) call void @use_workgroup_id_xyz() - store volatile i32 0, i32* %alloca + store volatile i32 0, i32 addrspace(5)* %alloca %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* Index: test/CodeGen/AMDGPU/callee-special-input-vgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +target datalayout = "A5" ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt @@ -368,7 
+369,7 @@ i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, - i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31, i32* byval %arg32) #1 { + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31, i32 addrspace(5)* byval %arg32) #1 { %val = call i32 @llvm.amdgcn.workitem.id.x() store volatile i32 %val, i32 addrspace(1)* undef @@ -407,7 +408,7 @@ store volatile i32 %arg29, i32 addrspace(1)* undef store volatile i32 %arg30, i32 addrspace(1)* undef store volatile i32 %arg31, i32 addrspace(1)* undef - %private = load volatile i32, i32* %arg32 + %private = load volatile i32, i32 addrspace(5)* %arg32 ret void } @@ -435,8 +436,8 @@ ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 { - %alloca = alloca i32, align 4 - store volatile i32 999, i32* %alloca + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 999, i32 addrspace(5)* %alloca call void @too_many_args_use_workitem_id_x_byval( i32 10, i32 20, i32 30, i32 40, i32 50, i32 60, i32 70, i32 80, @@ -446,7 +447,7 @@ i32 210, i32 220, i32 230, i32 240, i32 250, i32 260, i32 270, i32 280, i32 290, i32 300, i32 310, i32 320, - i32* %alloca) + i32 addrspace(5)* %alloca) ret void } @@ -460,8 +461,8 @@ ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], ; GCN: s_swappc_b64 define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { - %alloca = alloca i32, align 4 - store volatile i32 999, i32* %alloca + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 999, i32 addrspace(5)* %alloca call void @too_many_args_use_workitem_id_x_byval( i32 10, i32 20, i32 30, i32 40, i32 50, i32 60, i32 70, i32 80, @@ -471,7 +472,7 
@@ i32 210, i32 220, i32 230, i32 240, i32 250, i32 260, i32 270, i32 280, i32 290, i32 300, i32 310, i32 320, - i32* %alloca) + i32 addrspace(5)* %alloca) ret void } Index: test/CodeGen/AMDGPU/captured-frame-index.ll =================================================================== --- test/CodeGen/AMDGPU/captured-frame-index.ll +++ test/CodeGen/AMDGPU/captured-frame-index.ll @@ -1,14 +1,15 @@ ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +target datalayout = "A5" ; GCN-LABEL: {{^}}store_fi_lifetime: ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[FI]] define amdgpu_kernel void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 { entry: - %b = alloca i8 - call void @llvm.lifetime.start.p0i8(i64 1, i8* %b) - store volatile i8* %b, i8* addrspace(1)* undef - call void @llvm.lifetime.end.p0i8(i64 1, i8* %b) + %b = alloca i8, addrspace(5) + call void @llvm.lifetime.start.p5i8(i64 1, i8 addrspace(5)* %b) + store volatile i8 addrspace(5)* %b, i8 addrspace(5)* addrspace(1)* undef + call void @llvm.lifetime.end.p5i8(i64 1, i8 addrspace(5)* %b) ret void } @@ -18,10 +19,10 @@ ; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 4{{$}} ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]] ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]] -define amdgpu_kernel void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 { - %tmp = alloca float - store float 4.0, float *%tmp - store float* %tmp, float* addrspace(3)* %ptr +define amdgpu_kernel void @stored_fi_to_lds(float addrspace(5)* addrspace(3)* %ptr) #0 { + %tmp = alloca float, addrspace(5) + store float 4.0, float addrspace(5)*%tmp + store float addrspace(5)* %tmp, float addrspace(5)* addrspace(3)* %ptr ret void } @@ -38,13 +39,13 @@ ; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}} ; GCN: ds_write_b32 [[VLDSPTR]], [[FI1]] -define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 { - %tmp0 = alloca 
float - %tmp1 = alloca float - store float 4.0, float* %tmp0 - store float 4.0, float* %tmp1 - store volatile float* %tmp0, float* addrspace(3)* %ptr - store volatile float* %tmp1, float* addrspace(3)* %ptr +define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(float addrspace(5)* addrspace(3)* %ptr) #0 { + %tmp0 = alloca float, addrspace(5) + %tmp1 = alloca float, addrspace(5) + store float 4.0, float addrspace(5)* %tmp0 + store float 4.0, float addrspace(5)* %tmp1 + store volatile float addrspace(5)* %tmp0, float addrspace(5)* addrspace(3)* %ptr + store volatile float addrspace(5)* %tmp1, float addrspace(5)* addrspace(3)* %ptr ret void } @@ -55,12 +56,12 @@ ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} define amdgpu_kernel void @stored_fi_to_self() #0 { - %tmp = alloca i32* + %tmp = alloca i32 addrspace(5)*, addrspace(5) ; Avoid optimizing everything out - store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp - %bitcast = bitcast i32** %tmp to i32* - store volatile i32* %bitcast, i32** %tmp + store volatile i32 addrspace(5)* inttoptr (i32 1234 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp + %bitcast = bitcast i32 addrspace(5)* addrspace(5)* %tmp to i32 addrspace(5)* + store volatile i32 addrspace(5)* %bitcast, i32 addrspace(5)* addrspace(5)* %tmp ret void } @@ -74,17 +75,17 @@ ; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}} ; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}} define amdgpu_kernel void @stored_fi_to_self_offset() #0 { - %tmp0 = alloca [512 x i32] - %tmp1 = alloca i32* + %tmp0 = alloca [512 x i32], addrspace(5) + %tmp1 = alloca i32 addrspace(5)*, addrspace(5) ; Avoid optimizing everything out - %tmp0.cast = bitcast [512 x i32]* %tmp0 to i32* - store volatile i32 32, i32* %tmp0.cast + %tmp0.cast = bitcast [512 x i32] addrspace(5)* %tmp0 to i32 addrspace(5)* + store volatile i32 32, 
i32 addrspace(5)* %tmp0.cast - store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp1 + store volatile i32 addrspace(5)* inttoptr (i32 1234 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp1 - %bitcast = bitcast i32** %tmp1 to i32* - store volatile i32* %bitcast, i32** %tmp1 + %bitcast = bitcast i32 addrspace(5)* addrspace(5)* %tmp1 to i32 addrspace(5)* + store volatile i32 addrspace(5)* %bitcast, i32 addrspace(5)* addrspace(5)* %tmp1 ret void } @@ -99,18 +100,18 @@ ; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}} ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} define amdgpu_kernel void @stored_fi_to_fi() #0 { - %tmp0 = alloca i32* - %tmp1 = alloca i32* - %tmp2 = alloca i32* - store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp0 - store volatile i32* inttoptr (i32 5678 to i32*), i32** %tmp1 - store volatile i32* inttoptr (i32 9999 to i32*), i32** %tmp2 - - %bitcast1 = bitcast i32** %tmp1 to i32* - %bitcast2 = bitcast i32** %tmp2 to i32* ; at offset 8 - - store volatile i32* %bitcast1, i32** %tmp2 ; store offset 4 at offset 8 - store volatile i32* %bitcast2, i32** %tmp1 ; store offset 8 at offset 4 + %tmp0 = alloca i32 addrspace(5)*, addrspace(5) + %tmp1 = alloca i32 addrspace(5)*, addrspace(5) + %tmp2 = alloca i32 addrspace(5)*, addrspace(5) + store volatile i32 addrspace(5)* inttoptr (i32 1234 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp0 + store volatile i32 addrspace(5)* inttoptr (i32 5678 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp1 + store volatile i32 addrspace(5)* inttoptr (i32 9999 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp2 + + %bitcast1 = bitcast i32 addrspace(5)* addrspace(5)* %tmp1 to i32 addrspace(5)* + %bitcast2 = bitcast i32 addrspace(5)* addrspace(5)* %tmp2 to i32 addrspace(5)* ; at offset 8 + + store volatile i32 addrspace(5)* %bitcast1, i32 addrspace(5)* addrspace(5)* %tmp2 ; store offset 4 at offset 8 + store volatile i32 addrspace(5)* 
%bitcast2, i32 addrspace(5)* addrspace(5)* %tmp1 ; store offset 8 at offset 4 ret void } @@ -118,10 +119,10 @@ ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[FI]] -define amdgpu_kernel void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 { - %tmp = alloca float - store float 0.0, float *%tmp - store float* %tmp, float* addrspace(1)* %ptr +define amdgpu_kernel void @stored_fi_to_global(float addrspace(5)* addrspace(1)* %ptr) #0 { + %tmp = alloca float, addrspace(5) + store float 0.0, float addrspace(5)*%tmp + store float addrspace(5)* %tmp, float addrspace(5)* addrspace(1)* %ptr ret void } @@ -136,15 +137,15 @@ ; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}} ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 { - %tmp0 = alloca float - %tmp1 = alloca float - %tmp2 = alloca float - store volatile float 0.0, float *%tmp0 - store volatile float 0.0, float *%tmp1 - store volatile float 0.0, float *%tmp2 - store volatile float* %tmp1, float* addrspace(1)* %ptr - store volatile float* %tmp2, float* addrspace(1)* %ptr +define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float addrspace(5)* addrspace(1)* %ptr) #0 { + %tmp0 = alloca float, addrspace(5) + %tmp1 = alloca float, addrspace(5) + %tmp2 = alloca float, addrspace(5) + store volatile float 0.0, float addrspace(5)*%tmp0 + store volatile float 0.0, float addrspace(5)*%tmp1 + store volatile float 0.0, float addrspace(5)*%tmp2 + store volatile float addrspace(5)* %tmp1, float addrspace(5)* addrspace(1)* %ptr + store volatile float addrspace(5)* %tmp2, float addrspace(5)* addrspace(1)* %ptr ret void } @@ -163,19 +164,19 @@ ; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} ; GCN: buffer_store_dword [[BASE_1_OFF_2]], off, 
s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 { - %tmp0 = alloca [4096 x i32] - %tmp1 = alloca [4096 x i32] - %gep0.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 0 - store volatile i32 0, i32* %gep0.tmp0 - %gep1.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 4095 - store volatile i32 999, i32* %gep1.tmp0 - %gep0.tmp1 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 14 - store i32* %gep0.tmp1, i32* addrspace(1)* %ptr +define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(i32 addrspace(5)* addrspace(1)* %ptr) #0 { + %tmp0 = alloca [4096 x i32], addrspace(5) + %tmp1 = alloca [4096 x i32], addrspace(5) + %gep0.tmp0 = getelementptr [4096 x i32], [4096 x i32] addrspace(5)* %tmp0, i32 0, i32 0 + store volatile i32 0, i32 addrspace(5)* %gep0.tmp0 + %gep1.tmp0 = getelementptr [4096 x i32], [4096 x i32] addrspace(5)* %tmp0, i32 0, i32 4095 + store volatile i32 999, i32 addrspace(5)* %gep1.tmp0 + %gep0.tmp1 = getelementptr [4096 x i32], [4096 x i32] addrspace(5)* %tmp0, i32 0, i32 14 + store i32 addrspace(5)* %gep0.tmp1, i32 addrspace(5)* addrspace(1)* %ptr ret void } -@g1 = external addrspace(1) global i32* +@g1 = external addrspace(1) global i32 addrspace(5)* ; This was leaving a dead node around resulting in failing to select ; on the leftover AssertZext's ValueType operand. 
@@ -188,16 +189,16 @@ ; GCN: buffer_store_dword [[FI]] define amdgpu_kernel void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 { entry: - %b = alloca i32, align 4 - %tmp1 = load volatile i32*, i32* addrspace(1)* @g1, align 4 - %arrayidx = getelementptr inbounds i32, i32* %tmp1, i32 %idx - %tmp2 = load i32, i32* %arrayidx, align 4 - store volatile i32* %b, i32* addrspace(1)* undef + %b = alloca i32, align 4, addrspace(5) + %tmp1 = load volatile i32 addrspace(5)*, i32 addrspace(5)* addrspace(1)* @g1, align 4 + %arrayidx = getelementptr inbounds i32, i32 addrspace(5)* %tmp1, i32 %idx + %tmp2 = load i32, i32 addrspace(5)* %arrayidx, align 4 + store volatile i32 addrspace(5)* %b, i32 addrspace(5)* addrspace(1)* undef ret void } -declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 -declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 +declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #1 +declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #1 attributes #0 = { nounwind } attributes #1 = { argmemonly nounwind } Index: test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll =================================================================== --- test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll +++ test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll @@ -6,32 +6,32 @@ ; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; OPT-LABEL: @test_no_sink_flat_small_offset_i32( -; OPT-CIVI: getelementptr i32, i32 addrspace(4)* %in +; OPT-CIVI: getelementptr i32, i32* %in ; OPT-CIVI: br i1 ; OPT-CIVI-NOT: ptrtoint ; OPT-GFX9: br -; OPT-GFX9: %sunkaddr = getelementptr i8, i8 addrspace(4)* %0, i64 28 -; OPT-GFX9: %1 = bitcast i8 addrspace(4)* %sunkaddr to i32 addrspace(4)* -; OPT-GFX9: load i32, i32 addrspace(4)* %1 +; OPT-GFX9: %sunkaddr = getelementptr i8, i8* %0, i64 28 +; OPT-GFX9: %1 = 
bitcast i8* %sunkaddr to i32* +; OPT-GFX9: load i32, i32* %1 ; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32: ; GCN: flat_load_dword ; GCN: {{^}}BB0_2: -define amdgpu_kernel void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +define amdgpu_kernel void @test_no_sink_flat_small_offset_i32(i32* %out, i32* %in, i32 %cond) { entry: - %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 + %out.gep = getelementptr i32, i32* %out, i64 999999 + %in.gep = getelementptr i32, i32* %in, i64 7 %tmp0 = icmp eq i32 %cond, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i32, i32 addrspace(4)* %in.gep + %tmp1 = load i32, i32* %in.gep br label %endif endif: %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(4)* %out.gep + store i32 %x, i32* %out.gep br label %done done: @@ -39,7 +39,7 @@ } ; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32( -; OPT: getelementptr i32, i32 addrspace(4)* %out, +; OPT: getelementptr i32, i32* %out, ; rOPT-CI-NOT: getelementptr ; OPT: br i1 @@ -50,11 +50,11 @@ ; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_global_i32: ; CI: buffer_load_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28 -define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_global_i32(i32* %out, i32* %in, i32 %cond) { entry: - %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 - %cast = addrspacecast i32 addrspace(4)* %in.gep to i32 addrspace(1)* + %out.gep = getelementptr i32, i32* %out, i64 999999 + %in.gep = getelementptr i32, i32* %in, i64 7 + %cast = addrspacecast i32* %in.gep to i32 addrspace(1)* %tmp0 = icmp eq i32 %cond, 0 br i1 %tmp0, label %endif, label %if @@ -64,7 +64,7 @@ 
endif: %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(4)* %out.gep + store i32 %x, i32* %out.gep br label %done done: @@ -72,7 +72,7 @@ } ; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_constant_i32( -; OPT: getelementptr i32, i32 addrspace(4)* %out, +; OPT: getelementptr i32, i32* %out, ; OPT-CI-NOT: getelementptr ; OPT: br i1 @@ -83,11 +83,11 @@ ; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_constant_i32: ; CI: s_load_dword {{s[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32* %out, i32* %in, i32 %cond) { entry: - %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 - %cast = addrspacecast i32 addrspace(4)* %in.gep to i32 addrspace(2)* + %out.gep = getelementptr i32, i32* %out, i64 999999 + %in.gep = getelementptr i32, i32* %in, i64 7 + %cast = addrspacecast i32* %in.gep to i32 addrspace(2)* %tmp0 = icmp eq i32 %cond, 0 br i1 %tmp0, label %endif, label %if @@ -97,7 +97,7 @@ endif: %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(4)* %out.gep + store i32 %x, i32* %out.gep br label %done done: @@ -105,34 +105,34 @@ } ; OPT-LABEL: @test_sink_flat_small_max_flat_offset( -; OPT-CIVI: %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4095 +; OPT-CIVI: %in.gep = getelementptr i8, i8* %in, i64 4095 ; OPT-CIVI: br ; OPT-CIVI-NOT: getelementptr -; OPT-CIVI: load i8, i8 addrspace(4)* %in.gep +; OPT-CIVI: load i8, i8* %in.gep ; OPT-GFX9: br -; OPT-GFX9: %sunkaddr = getelementptr i8, i8 addrspace(4)* %in, i64 4095 -; OPT-GFX9: load i8, i8 addrspace(4)* %sunkaddr +; OPT-GFX9: %sunkaddr = getelementptr i8, i8* %in, i64 4095 +; OPT-GFX9: load i8, i8* %sunkaddr ; GCN-LABEL: {{^}}test_sink_flat_small_max_flat_offset: ; GFX9: flat_load_sbyte 
v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095{{$}} ; CIVI: flat_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @test_sink_flat_small_max_flat_offset(i32 addrspace(4)* %out, i8 addrspace(4)* %in) #1 { +define amdgpu_kernel void @test_sink_flat_small_max_flat_offset(i32* %out, i8* %in) #1 { entry: - %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 1024 - %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4095 + %out.gep = getelementptr i32, i32* %out, i32 1024 + %in.gep = getelementptr i8, i8* %in, i64 4095 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i8, i8 addrspace(4)* %in.gep + %tmp1 = load i8, i8* %in.gep %tmp2 = sext i8 %tmp1 to i32 br label %endif endif: %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(4)* %out.gep + store i32 %x, i32* %out.gep br label %done done: @@ -140,29 +140,29 @@ } ; OPT-LABEL: @test_sink_flat_small_max_plus_1_flat_offset( -; OPT: %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4096 +; OPT: %in.gep = getelementptr i8, i8* %in, i64 4096 ; OPT: br ; OPT-NOT: getelementptr -; OPT: load i8, i8 addrspace(4)* %in.gep +; OPT: load i8, i8* %in.gep ; GCN-LABEL: {{^}}test_sink_flat_small_max_plus_1_flat_offset: ; GCN: flat_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @test_sink_flat_small_max_plus_1_flat_offset(i32 addrspace(4)* %out, i8 addrspace(4)* %in) #1 { +define amdgpu_kernel void @test_sink_flat_small_max_plus_1_flat_offset(i32* %out, i8* %in) #1 { entry: - %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 99999 - %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4096 + %out.gep = getelementptr i32, i32* %out, i64 99999 + %in.gep = getelementptr i8, i8* %in, i64 4096 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i8, i8 addrspace(4)* %in.gep 
+ %tmp1 = load i8, i8* %in.gep %tmp2 = sext i8 %tmp1 to i32 br label %endif endif: %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(4)* %out.gep + store i32 %x, i32* %out.gep br label %done done: @@ -170,30 +170,30 @@ } ; OPT-LABEL: @test_no_sink_flat_reg_offset( -; OPT: %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 %reg +; OPT: %in.gep = getelementptr i8, i8* %in, i64 %reg ; OPT: br ; OPT-NOT: getelementptr -; OPT: load i8, i8 addrspace(4)* %in.gep +; OPT: load i8, i8* %in.gep ; GCN-LABEL: {{^}}test_no_sink_flat_reg_offset: ; GCN: flat_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @test_no_sink_flat_reg_offset(i32 addrspace(4)* %out, i8 addrspace(4)* %in, i64 %reg) #1 { +define amdgpu_kernel void @test_no_sink_flat_reg_offset(i32* %out, i8* %in, i64 %reg) #1 { entry: - %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 1024 - %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 %reg + %out.gep = getelementptr i32, i32* %out, i32 1024 + %in.gep = getelementptr i8, i8* %in, i64 %reg %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i8, i8 addrspace(4)* %in.gep + %tmp1 = load i8, i8* %in.gep %tmp2 = sext i8 %tmp1 to i32 br label %endif endif: %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(4)* %out.gep + store i32 %x, i32* %out.gep br label %done done: Index: test/CodeGen/AMDGPU/cgp-addressing-modes.ll =================================================================== --- test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -7,7 +7,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false 
-amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5" ; OPT-LABEL: @test_sink_global_small_offset_i32( ; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in @@ -137,24 +137,24 @@ ; GCN: {{^}}BB4_2: define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: - %alloca = alloca [512 x i32], align 4 + %alloca = alloca [512 x i32], align 4, addrspace(5) %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %add.arg = add i32 %arg, 8 - %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1022 + %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1022 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - store volatile i32 123, i32* %alloca.gep - %tmp1 = load volatile i32, i32* %alloca.gep + store volatile i32 123, i32 addrspace(5)* %alloca.gep + %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep br label %endif endif: %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] store i32 %x, i32 addrspace(1)* %out.gep.0 - %load = load volatile i32, i32* %alloca.gep + %load = load volatile i32, i32 addrspace(5)* %alloca.gep store i32 %load, i32 addrspace(1)* %out.gep.1 br label %done @@ -178,24 +178,24 @@ define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: - %alloca = alloca [512 x i32], align 4 + %alloca = alloca [512 x i32], align 4, 
addrspace(5) %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %add.arg = add i32 %arg, 8 - %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1023 + %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1023 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - store volatile i32 123, i32* %alloca.gep - %tmp1 = load volatile i32, i32* %alloca.gep + store volatile i32 123, i32 addrspace(5)* %alloca.gep + %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep br label %endif endif: %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] store i32 %x, i32 addrspace(1)* %out.gep.0 - %load = load volatile i32, i32* %alloca.gep + %load = load volatile i32, i32 addrspace(5)* %alloca.gep store i32 %load, i32 addrspace(1)* %out.gep.1 br label %done @@ -204,7 +204,7 @@ } ; OPT-LABEL: @test_no_sink_scratch_large_offset_i32( -; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024 +; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1024 ; OPT: br i1 ; OPT-NOT: ptrtoint @@ -215,24 +215,24 @@ ; GCN: {{^BB[0-9]+}}_2: define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: - %alloca = alloca [512 x i32], align 4 + %alloca = alloca [512 x i32], align 4, addrspace(5) %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %add.arg = add i32 %arg, 8 - %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024 + %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1024 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - store volatile 
i32 123, i32* %alloca.gep - %tmp1 = load volatile i32, i32* %alloca.gep + store volatile i32 123, i32 addrspace(5)* %alloca.gep + %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep br label %endif endif: %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] store i32 %x, i32 addrspace(1)* %out.gep.0 - %load = load volatile i32, i32* %alloca.gep + %load = load volatile i32, i32 addrspace(5)* %alloca.gep store i32 %load, i32 addrspace(1)* %out.gep.1 br label %done Index: test/CodeGen/AMDGPU/collapse-endcf.ll =================================================================== --- test/CodeGen/AMDGPU/collapse-endcf.ll +++ test/CodeGen/AMDGPU/collapse-endcf.ll @@ -237,7 +237,7 @@ br i1 %tmp3, label %bb4, label %bb10 bb4: ; preds = %bb2 - %tmp6 = load float, float* undef + %tmp6 = load float, float addrspace(5)* undef %tmp7 = fcmp olt float %tmp6, 0.0 br i1 %tmp7, label %bb8, label %Flow @@ -257,7 +257,7 @@ br label %bb1 bb12: ; preds = %bb10 - store volatile <4 x float> %tmp11, <4 x float>* undef, align 16 + store volatile <4 x float> %tmp11, <4 x float> addrspace(5)* undef, align 16 ret void } Index: test/CodeGen/AMDGPU/commute-compares.ll =================================================================== --- test/CodeGen/AMDGPU/commute-compares.ll +++ test/CodeGen/AMDGPU/commute-compares.ll @@ -1,4 +1,5 @@ ; RUN: llc -march=amdgcn -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +target datalayout = "A5" declare i32 @llvm.amdgcn.workitem.id.x() #0 @@ -703,9 +704,9 @@ ; GCN: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, [[FI]] define amdgpu_kernel void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 { entry: - %stack0 = alloca i32 - %ptr0 = load volatile i32*, i32* addrspace(1)* undef - %eq = icmp eq i32* %ptr0, %stack0 + %stack0 = alloca i32, addrspace(5) + %ptr0 = load volatile i32 addrspace(5)*, i32 addrspace(5)* addrspace(1)* undef + %eq = icmp eq i32 addrspace(5)* %ptr0, %stack0 %ext = zext i1 %eq to i32 store volatile 
i32 %ext, i32 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/copy-to-reg.ll =================================================================== --- test/CodeGen/AMDGPU/copy-to-reg.ll +++ test/CodeGen/AMDGPU/copy-to-reg.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -verify-machineinstrs < %s +target datalayout = "A5" ; Test that CopyToReg instructions don't have non-register operands prior ; to being emitted. @@ -8,20 +9,20 @@ ; CHECK-LABEL: {{^}}copy_to_reg_frameindex: define amdgpu_kernel void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: - %alloca = alloca [16 x i32] + %alloca = alloca [16 x i32], addrspace(5) br label %loop loop: %inc = phi i32 [0, %entry], [%inc.i, %loop] - %ptr = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %inc - store i32 %inc, i32* %ptr + %ptr = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %inc + store i32 %inc, i32 addrspace(5)* %ptr %inc.i = add i32 %inc, 1 %cnd = icmp uge i32 %inc.i, 16 br i1 %cnd, label %done, label %loop done: - %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 0 - %tmp1 = load i32, i32* %tmp0 + %tmp0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0 + %tmp1 = load i32, i32 addrspace(5)* %tmp0 store i32 %tmp1, i32 addrspace(1)* %out ret void } Index: test/CodeGen/AMDGPU/extload-private.ll =================================================================== --- test/CodeGen/AMDGPU/extload-private.ll +++ test/CodeGen/AMDGPU/extload-private.ll @@ -1,12 +1,13 @@ ; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +target datalayout = 
"A5" ; FUNC-LABEL: {{^}}load_i8_sext_private: ; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} define amdgpu_kernel void @load_i8_sext_private(i32 addrspace(1)* %out) { entry: - %tmp0 = alloca i8 - %tmp1 = load i8, i8* %tmp0 + %tmp0 = alloca i8, addrspace(5) + %tmp1 = load i8, i8 addrspace(5)* %tmp0 %tmp2 = sext i8 %tmp1 to i32 store i32 %tmp2, i32 addrspace(1)* %out ret void @@ -16,8 +17,8 @@ ; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} define amdgpu_kernel void @load_i8_zext_private(i32 addrspace(1)* %out) { entry: - %tmp0 = alloca i8 - %tmp1 = load i8, i8* %tmp0 + %tmp0 = alloca i8, addrspace(5) + %tmp1 = load i8, i8 addrspace(5)* %tmp0 %tmp2 = zext i8 %tmp1 to i32 store i32 %tmp2, i32 addrspace(1)* %out ret void @@ -27,8 +28,8 @@ ; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} define amdgpu_kernel void @load_i16_sext_private(i32 addrspace(1)* %out) { entry: - %tmp0 = alloca i16 - %tmp1 = load i16, i16* %tmp0 + %tmp0 = alloca i16, addrspace(5) + %tmp1 = load i16, i16 addrspace(5)* %tmp0 %tmp2 = sext i16 %tmp1 to i32 store i32 %tmp2, i32 addrspace(1)* %out ret void @@ -38,8 +39,8 @@ ; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} define amdgpu_kernel void @load_i16_zext_private(i32 addrspace(1)* %out) { entry: - %tmp0 = alloca i16 - %tmp1 = load volatile i16, i16* %tmp0 + %tmp0 = alloca i16, addrspace(5) + %tmp1 = load volatile i16, i16 addrspace(5)* %tmp0 %tmp2 = zext i16 %tmp1 to i32 store i32 %tmp2, i32 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/flat-address-space.ll =================================================================== --- test/CodeGen/AMDGPU/flat-address-space.ll +++ test/CodeGen/AMDGPU/flat-address-space.ll @@ -2,6 +2,7 @@ ; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,CIVI %s ; RUN: llc -O0 
-mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,HSA %s ; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,HSA,GFX9 %s +target datalayout = "A5" ; Disable optimizations in case there are optimizations added that ; specialize away generic pointer accesses. @@ -19,42 +20,42 @@ ; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] ; CHECK: flat_store_dword v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}, v[[DATA]] define amdgpu_kernel void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 { - %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* - store volatile i32 %x, i32 addrspace(4)* %fptr, align 4 + %fptr = addrspacecast i32 addrspace(1)* %gptr to i32* + store volatile i32 %x, i32* %fptr, align 4 ret void } ; CHECK-LABEL: {{^}}store_flat_i64: ; CHECK: flat_store_dwordx2 define amdgpu_kernel void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 { - %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* - store volatile i64 %x, i64 addrspace(4)* %fptr, align 8 + %fptr = addrspacecast i64 addrspace(1)* %gptr to i64* + store volatile i64 %x, i64* %fptr, align 8 ret void } ; CHECK-LABEL: {{^}}store_flat_v4i32: ; CHECK: flat_store_dwordx4 define amdgpu_kernel void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 { - %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* - store volatile <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16 + %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32>* + store volatile <4 x i32> %x, <4 x i32>* %fptr, align 16 ret void } ; CHECK-LABEL: {{^}}store_flat_trunc_i16: ; CHECK: flat_store_short define amdgpu_kernel void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 { - %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* + %fptr = addrspacecast i16 addrspace(1)* %gptr to i16* %y = trunc i32 %x to i16 - store 
volatile i16 %y, i16 addrspace(4)* %fptr, align 2 + store volatile i16 %y, i16* %fptr, align 2 ret void } ; CHECK-LABEL: {{^}}store_flat_trunc_i8: ; CHECK: flat_store_byte define amdgpu_kernel void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 { - %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* + %fptr = addrspacecast i8 addrspace(1)* %gptr to i8* %y = trunc i32 %x to i8 - store volatile i8 %y, i8 addrspace(4)* %fptr, align 2 + store volatile i8 %y, i8* %fptr, align 2 ret void } @@ -63,8 +64,8 @@ ; CHECK-LABEL: load_flat_i32: ; CHECK: flat_load_dword define amdgpu_kernel void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* - %fload = load volatile i32, i32 addrspace(4)* %fptr, align 4 + %fptr = addrspacecast i32 addrspace(1)* %gptr to i32* + %fload = load volatile i32, i32* %fptr, align 4 store i32 %fload, i32 addrspace(1)* %out, align 4 ret void } @@ -72,8 +73,8 @@ ; CHECK-LABEL: load_flat_i64: ; CHECK: flat_load_dwordx2 define amdgpu_kernel void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* - %fload = load volatile i64, i64 addrspace(4)* %fptr, align 8 + %fptr = addrspacecast i64 addrspace(1)* %gptr to i64* + %fload = load volatile i64, i64* %fptr, align 8 store i64 %fload, i64 addrspace(1)* %out, align 8 ret void } @@ -81,8 +82,8 @@ ; CHECK-LABEL: load_flat_v4i32: ; CHECK: flat_load_dwordx4 define amdgpu_kernel void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* - %fload = load volatile <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 32 + %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32>* + %fload = load volatile <4 x i32>, <4 x i32>* %fptr, align 32 store <4 x i32> %fload, <4 x 
i32> addrspace(1)* %out, align 8 ret void } @@ -90,8 +91,8 @@ ; CHECK-LABEL: sextload_flat_i8: ; CHECK: flat_load_sbyte define amdgpu_kernel void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* - %fload = load volatile i8, i8 addrspace(4)* %fptr, align 4 + %fptr = addrspacecast i8 addrspace(1)* %gptr to i8* + %fload = load volatile i8, i8* %fptr, align 4 %ext = sext i8 %fload to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void @@ -100,8 +101,8 @@ ; CHECK-LABEL: zextload_flat_i8: ; CHECK: flat_load_ubyte define amdgpu_kernel void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* - %fload = load volatile i8, i8 addrspace(4)* %fptr, align 4 + %fptr = addrspacecast i8 addrspace(1)* %gptr to i8* + %fload = load volatile i8, i8* %fptr, align 4 %ext = zext i8 %fload to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void @@ -110,8 +111,8 @@ ; CHECK-LABEL: sextload_flat_i16: ; CHECK: flat_load_sshort define amdgpu_kernel void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* - %fload = load volatile i16, i16 addrspace(4)* %fptr, align 4 + %fptr = addrspacecast i16 addrspace(1)* %gptr to i16* + %fload = load volatile i16, i16* %fptr, align 4 %ext = sext i16 %fload to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void @@ -120,8 +121,8 @@ ; CHECK-LABEL: zextload_flat_i16: ; CHECK: flat_load_ushort define amdgpu_kernel void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* - %fload = load volatile i16, i16 addrspace(4)* %fptr, align 4 + %fptr = addrspacecast i16 addrspace(1)* %gptr to i16* + %fload = load volatile i16, i16* 
%fptr, align 4 %ext = zext i16 %fload to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void @@ -133,9 +134,9 @@ ; CHECK: flat_load_ubyte ; CHECK: flat_load_ubyte define amdgpu_kernel void @flat_scratch_unaligned_load() { - %scratch = alloca i32 - %fptr = addrspacecast i32* %scratch to i32 addrspace(4)* - %ld = load volatile i32, i32 addrspace(4)* %fptr, align 1 + %scratch = alloca i32, addrspace(5) + %fptr = addrspacecast i32 addrspace(5)* %scratch to i32* + %ld = load volatile i32, i32* %fptr, align 1 ret void } @@ -145,9 +146,9 @@ ; CHECK: flat_store_byte ; CHECK: flat_store_byte define amdgpu_kernel void @flat_scratch_unaligned_store() { - %scratch = alloca i32 - %fptr = addrspacecast i32* %scratch to i32 addrspace(4)* - store volatile i32 0, i32 addrspace(4)* %fptr, align 1 + %scratch = alloca i32, addrspace(5) + %fptr = addrspacecast i32 addrspace(5)* %scratch to i32* + store volatile i32 0, i32* %fptr, align 1 ret void } @@ -156,9 +157,9 @@ ; HSA: flat_load_dword ; FIXME: These tests are broken for os = mesa3d, becasue it doesn't initialize flat_scr define amdgpu_kernel void @flat_scratch_multidword_load() { - %scratch = alloca <2 x i32> - %fptr = addrspacecast <2 x i32>* %scratch to <2 x i32> addrspace(4)* - %ld = load volatile <2 x i32>, <2 x i32> addrspace(4)* %fptr + %scratch = alloca <2 x i32>, addrspace(5) + %fptr = addrspacecast <2 x i32> addrspace(5)* %scratch to <2 x i32>* + %ld = load volatile <2 x i32>, <2 x i32>* %fptr ret void } @@ -167,59 +168,59 @@ ; HSA: flat_store_dword ; FIXME: These tests are broken for os = mesa3d, becasue it doesn't initialize flat_scr define amdgpu_kernel void @flat_scratch_multidword_store() { - %scratch = alloca <2 x i32> - %fptr = addrspacecast <2 x i32>* %scratch to <2 x i32> addrspace(4)* - store volatile <2 x i32> zeroinitializer, <2 x i32> addrspace(4)* %fptr + %scratch = alloca <2 x i32>, addrspace(5) + %fptr = addrspacecast <2 x i32> addrspace(5)* %scratch to <2 x i32>* + store volatile <2 x i32> 
zeroinitializer, <2 x i32>* %fptr ret void } ; CHECK-LABEL: {{^}}store_flat_i8_max_offset: ; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}} ; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:4095{{$}} -define amdgpu_kernel void @store_flat_i8_max_offset(i8 addrspace(4)* %fptr, i8 %x) #0 { - %fptr.offset = getelementptr inbounds i8, i8 addrspace(4)* %fptr, i64 4095 - store volatile i8 %x, i8 addrspace(4)* %fptr.offset +define amdgpu_kernel void @store_flat_i8_max_offset(i8* %fptr, i8 %x) #0 { + %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095 + store volatile i8 %x, i8* %fptr.offset ret void } ; CHECK-LABEL: {{^}}store_flat_i8_max_offset_p1: ; CHECK: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}} -define amdgpu_kernel void @store_flat_i8_max_offset_p1(i8 addrspace(4)* %fptr, i8 %x) #0 { - %fptr.offset = getelementptr inbounds i8, i8 addrspace(4)* %fptr, i64 4096 - store volatile i8 %x, i8 addrspace(4)* %fptr.offset +define amdgpu_kernel void @store_flat_i8_max_offset_p1(i8* %fptr, i8 %x) #0 { + %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096 + store volatile i8 %x, i8* %fptr.offset ret void } ; CHECK-LABEL: {{^}}store_flat_i8_neg_offset: ; CHECK: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}} -define amdgpu_kernel void @store_flat_i8_neg_offset(i8 addrspace(4)* %fptr, i8 %x) #0 { - %fptr.offset = getelementptr inbounds i8, i8 addrspace(4)* %fptr, i64 -2 - store volatile i8 %x, i8 addrspace(4)* %fptr.offset +define amdgpu_kernel void @store_flat_i8_neg_offset(i8* %fptr, i8 %x) #0 { + %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2 + store volatile i8 %x, i8* %fptr.offset ret void } ; CHECK-LABEL: {{^}}load_flat_i8_max_offset: ; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}} ; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095{{$}} -define amdgpu_kernel void @load_flat_i8_max_offset(i8 addrspace(4)* %fptr) #0 { - %fptr.offset = getelementptr 
inbounds i8, i8 addrspace(4)* %fptr, i64 4095 - %val = load volatile i8, i8 addrspace(4)* %fptr.offset +define amdgpu_kernel void @load_flat_i8_max_offset(i8* %fptr) #0 { + %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095 + %val = load volatile i8, i8* %fptr.offset ret void } ; CHECK-LABEL: {{^}}load_flat_i8_max_offset_p1: ; CHECK: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}} -define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8 addrspace(4)* %fptr) #0 { - %fptr.offset = getelementptr inbounds i8, i8 addrspace(4)* %fptr, i64 4096 - %val = load volatile i8, i8 addrspace(4)* %fptr.offset +define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8* %fptr) #0 { + %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096 + %val = load volatile i8, i8* %fptr.offset ret void } ; CHECK-LABEL: {{^}}load_flat_i8_neg_offset: ; CHECK: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}} -define amdgpu_kernel void @load_flat_i8_neg_offset(i8 addrspace(4)* %fptr) #0 { - %fptr.offset = getelementptr inbounds i8, i8 addrspace(4)* %fptr, i64 -2 - %val = load volatile i8, i8 addrspace(4)* %fptr.offset +define amdgpu_kernel void @load_flat_i8_neg_offset(i8* %fptr) #0 { + %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2 + %val = load volatile i8, i8* %fptr.offset ret void } Index: test/CodeGen/AMDGPU/flat_atomics.ll =================================================================== --- test/CodeGen/AMDGPU/flat_atomics.ll +++ test/CodeGen/AMDGPU/flat_atomics.ll @@ -5,29 +5,29 @@ ; GCN-LABEL: {{^}}atomic_add_i32_offset: ; CIVI: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GFX9: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_offset(i32* %out, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 
%in seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile add i32* %gep, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_add_i32_max_offset: ; CIVI: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GFX9: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:4092{{$}} -define amdgpu_kernel void @atomic_add_i32_max_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_max_offset(i32* %out, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 1023 - %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst + %gep = getelementptr i32, i32* %out, i32 1023 + %val = atomicrmw volatile add i32* %gep, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_add_i32_max_offset_p1: ; GCN: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -define amdgpu_kernel void @atomic_add_i32_max_offset_p1(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_max_offset_p1(i32* %out, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 1024 - %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst + %gep = getelementptr i32, i32* %out, i32 1024 + %val = atomicrmw volatile add i32* %gep, i32 %in seq_cst ret void } @@ -35,22 +35,22 @@ ; CIVI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_ret_offset(i32* %out, i32* %out2, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile 
add i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_add_i32_addr64_offset: ; CIVI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} ; GFX9: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile add i32* %gep, i32 %in seq_cst ret void } @@ -58,60 +58,60 @@ ; CIVI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile add i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_add_i32: ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define amdgpu_kernel void @atomic_add_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32(i32* %out, i32 %in) { entry: - %val = 
atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst + %val = atomicrmw volatile add i32* %out, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_add_i32_ret: ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_ret(i32* %out, i32* %out2, i32 %in) { entry: - %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %val = atomicrmw volatile add i32* %out, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_add_i32_addr64: ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_addr64(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile add i32* %ptr, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64: ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile add i32* %ptr, i32 %in seq_cst + store i32 %val, i32* %out2 ret 
void } ; GCN-LABEL: {{^}}atomic_and_i32_offset: ; CIVI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} ; GFX9: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_offset(i32* %out, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile and i32* %gep, i32 %in seq_cst ret void } @@ -119,22 +119,22 @@ ; CIVI: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_ret_offset(i32* %out, i32* %out2, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile and i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_and_i32_addr64_offset: ; CIVI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} ; GFX9: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %gep 
= getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile and i32* %gep, i32 %in seq_cst ret void } @@ -142,60 +142,60 @@ ; CIVI: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile and i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_and_i32: ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define amdgpu_kernel void @atomic_and_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_and_i32(i32* %out, i32 %in) { entry: - %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst + %val = atomicrmw volatile and i32* %out, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_and_i32_ret: ; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_ret(i32* %out, i32* %out2, i32 %in) { entry: - %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %val = atomicrmw volatile and i32* %out, i32 %in seq_cst + store i32 %val, i32* %out2 ret 
void } ; GCN-LABEL: {{^}}atomic_and_i32_addr64: ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_addr64(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile and i32* %ptr, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64: ; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile and i32* %ptr, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_sub_i32_offset: ; CIVI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} ; GFX9: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_offset(i32* %out, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile sub i32* %gep, i32 %in seq_cst ret void } @@ -203,22 +203,22 @@ ; CIVI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: 
flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32* %out, i32* %out2, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile sub i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_sub_i32_addr64_offset: ; CIVI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} ; GFX9: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile sub i32* %gep, i32 %in seq_cst ret void } @@ -226,60 +226,60 @@ ; CIVI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr 
i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile sub i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_sub_i32: ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32(i32* %out, i32 %in) { entry: - %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst + %val = atomicrmw volatile sub i32* %out, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_sub_i32_ret: ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_ret(i32* %out, i32* %out2, i32 %in) { entry: - %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %val = atomicrmw volatile sub i32* %out, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_sub_i32_addr64: ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_addr64(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile sub i32* %ptr, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64: ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: 
flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile sub i32* %ptr, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_max_i32_offset: ; CIVI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} ; GFX9: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_offset(i32* %out, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile max i32* %gep, i32 %in seq_cst ret void } @@ -287,22 +287,22 @@ ; CIVI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_ret_offset(i32* %out, i32* %out2, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile max i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: 
{{^}}atomic_max_i32_addr64_offset: ; CIVI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} ; GFX9: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile max i32* %gep, i32 %in seq_cst ret void } @@ -310,60 +310,60 @@ ; CIVI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile max i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_max_i32: ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define amdgpu_kernel void @atomic_max_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_max_i32(i32* %out, i32 %in) { entry: - %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst + %val = atomicrmw 
volatile max i32* %out, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_max_i32_ret: ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_ret(i32* %out, i32* %out2, i32 %in) { entry: - %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %val = atomicrmw volatile max i32* %out, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_max_i32_addr64: ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_addr64(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile max i32* %ptr, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64: ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile max i32* %ptr, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_umax_i32_offset: ; CIVI: flat_atomic_umax 
v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} ; GFX9: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_offset(i32* %out, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile umax i32* %gep, i32 %in seq_cst ret void } @@ -371,22 +371,22 @@ ; CIVI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32* %out, i32* %out2, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile umax i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_umax_i32_addr64_offset: ; CIVI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} ; GFX9: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = 
atomicrmw volatile umax i32* %gep, i32 %in seq_cst ret void } @@ -394,60 +394,60 @@ ; CIVI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile umax i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_umax_i32: ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32(i32* %out, i32 %in) { entry: - %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst + %val = atomicrmw volatile umax i32* %out, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_umax_i32_ret: ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_ret(i32* %out, i32* %out2, i32 %in) { entry: - %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %val = atomicrmw volatile umax i32* %out, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: 
{{^}}atomic_umax_i32_addr64: ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_addr64(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile umax i32* %ptr, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64: ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile umax i32* %ptr, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_min_i32_offset: ; CIVI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} ; GFX9: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_offset(i32* %out, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile min i32* %gep, i32 %in seq_cst ret void } @@ -455,22 +455,22 @@ ; CIVI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_smin 
[[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_ret_offset(i32* %out, i32* %out2, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile min i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_min_i32_addr64_offset: ; CIVI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} ; GFX9: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile min i32* %gep, i32 %in seq_cst ret void } @@ -478,60 +478,60 @@ ; CIVI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 
addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile min i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_min_i32: ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define amdgpu_kernel void @atomic_min_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_min_i32(i32* %out, i32 %in) { entry: - %val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst + %val = atomicrmw volatile min i32* %out, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_min_i32_ret: ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_ret(i32* %out, i32* %out2, i32 %in) { entry: - %val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %val = atomicrmw volatile min i32* %out, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_min_i32_addr64: ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_addr64(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile min i32* %ptr, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64: ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: 
flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile min i32* %ptr, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_umin_i32_offset: ; CIVI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} ; GFX9: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_offset(i32* %out, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile umin i32* %gep, i32 %in seq_cst ret void } @@ -539,22 +539,22 @@ ; CIVI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32* %out, i32* %out2, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile umin i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: 
{{^}}atomic_umin_i32_addr64_offset: ; CIVI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} ; GFX9: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile umin i32* %gep, i32 %in seq_cst ret void } @@ -562,60 +562,60 @@ ; CIVI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile umin i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_umin_i32: ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32(i32* %out, i32 %in) { entry: - %val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst + %val = 
atomicrmw volatile umin i32* %out, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_umin_i32_ret: ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_ret(i32* %out, i32* %out2, i32 %in) { entry: - %val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %val = atomicrmw volatile umin i32* %out, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_umin_i32_addr64: ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_addr64(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile umin i32* %ptr, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64: ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]{{$}} - define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { + define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile umin i32* %ptr, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_or_i32_offset: ; 
CIVI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} ; GFX9: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_offset(i32* %out, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile or i32* %gep, i32 %in seq_cst ret void } @@ -623,22 +623,22 @@ ; CIVI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_ret_offset(i32* %out, i32* %out2, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile or i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_or_i32_addr64_offset: ; CIVI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} ; GFX9: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile 
or i32* %gep, i32 %in seq_cst ret void } @@ -646,60 +646,60 @@ ; CIVI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile or i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_or_i32: ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define amdgpu_kernel void @atomic_or_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_or_i32(i32* %out, i32 %in) { entry: - %val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst + %val = atomicrmw volatile or i32* %out, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_or_i32_ret: ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_ret(i32* %out, i32* %out2, i32 %in) { entry: - %val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %val = atomicrmw volatile or i32* %out, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_or_i32_addr64: ; GCN: flat_atomic_or 
v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_addr64(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile or i32* %ptr, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64: ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile or i32* %ptr, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_xchg_i32_offset: ; CIVI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} ; GFX9: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_offset(i32* %out, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile xchg i32* %gep, i32 %in seq_cst ret void } @@ -707,22 +707,22 @@ ; CIVI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 
glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32* %out, i32* %out2, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile xchg i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_xchg_i32_addr64_offset: ; CIVI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} ; GFX9: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile xchg i32* %gep, i32 %in seq_cst ret void } @@ -730,50 +730,50 @@ ; CIVI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile 
xchg i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile xchg i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_xchg_i32: ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32(i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst + %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_xchg_i32_ret: ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_ret(i32* %out, i32* %out2, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_xchg_i32_addr64: ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_addr64(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile xchg i32* %ptr, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64: ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword 
v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile xchg i32* %ptr, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } @@ -782,10 +782,10 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_offset: ; CIVI: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GFX9: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}} -define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(4)* %out, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst ret void } @@ -793,23 +793,23 @@ ; CIVI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GFX9: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]] -define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32* %out, i32* %out2, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst + %gep = getelementptr 
i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst %flag = extractvalue { i32, i1 } %val, 0 - store i32 %flag, i32 addrspace(4)* %out2 + store i32 %flag, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset: ; CIVI: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GFX9: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}} -define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32* %out, i32 %in, i64 %index, i32 %old) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst ret void } @@ -817,63 +817,63 @@ ; CIVI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GFX9: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]] -define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index, i32 %old) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst %flag = 
extractvalue { i32, i1 } %val, 0 - store i32 %flag, i32 addrspace(4)* %out2 + store i32 %flag, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_cmpxchg_i32: ; GCN: flat_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(4)* %out, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32(i32* %out, i32 %in, i32 %old) { entry: - %val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst + %val = cmpxchg volatile i32* %out, i32 %old, i32 %in seq_cst seq_cst ret void } ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret: ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]] -define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32* %out, i32* %out2, i32 %in, i32 %old) { entry: - %val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst + %val = cmpxchg volatile i32* %out, i32 %old, i32 %in seq_cst seq_cst %flag = extractvalue { i32, i1 } %val, 0 - store i32 %flag, i32 addrspace(4)* %out2 + store i32 %flag, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64: ; GCN: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32* %out, i32 %in, i64 %index, i32 %old) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %val = cmpxchg volatile i32* %ptr, i32 %old, i32 %in seq_cst seq_cst ret void } ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64: ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], 
v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]] -define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index, i32 %old) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %val = cmpxchg volatile i32* %ptr, i32 %old, i32 %in seq_cst seq_cst %flag = extractvalue { i32, i1 } %val, 0 - store i32 %flag, i32 addrspace(4)* %out2 + store i32 %flag, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_xor_i32_offset: ; CIVI: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GFX9: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_offset(i32* %out, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile xor i32* %gep, i32 %in seq_cst ret void } @@ -881,22 +881,22 @@ ; CIVI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32* %out, i32* %out2, i32 %in) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* 
%out2 + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile xor i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_xor_i32_addr64_offset: ; CIVI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} ; GFX9: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile xor i32* %gep, i32 %in seq_cst ret void } @@ -904,50 +904,50 @@ ; CIVI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GFX9: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile xor i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_xor_i32: ; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(4)* %out, i32 %in) { 
+define amdgpu_kernel void @atomic_xor_i32(i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst + %val = atomicrmw volatile xor i32* %out, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_xor_i32_ret: ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_ret(i32* %out, i32* %out2, i32 %in) { entry: - %val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %val = atomicrmw volatile xor i32* %out, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } ; GCN-LABEL: {{^}}atomic_xor_i32_addr64: ; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_addr64(i32* %out, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst + %ptr = getelementptr i32, i32* %out, i64 %index + %val = atomicrmw volatile xor i32* %ptr, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64: ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst - store i32 %val, i32 addrspace(4)* %out2 + %ptr = getelementptr i32, i32* %out, i64 %index + %val = 
atomicrmw volatile xor i32* %ptr, i32 %in seq_cst + store i32 %val, i32* %out2 ret void } @@ -955,21 +955,21 @@ ; CIVI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GFX9: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_load_i32_offset(i32* %in, i32* %out) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %in, i32 4 - %val = load atomic i32, i32 addrspace(4)* %gep seq_cst, align 4 - store i32 %val, i32 addrspace(4)* %out + %gep = getelementptr i32, i32* %in, i32 4 + %val = load atomic i32, i32* %gep seq_cst, align 4 + store i32 %val, i32* %out ret void } ; GCN-LABEL: {{^}}atomic_load_i32: ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_load_i32(i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_load_i32(i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in seq_cst, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in seq_cst, align 4 + store i32 %val, i32* %out ret void } @@ -977,60 +977,60 @@ ; CIVI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GFX9: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32* %in, i32* %out, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %val = load atomic i32, i32 addrspace(4)* %gep seq_cst, align 4 - store i32 %val, i32 
addrspace(4)* %out + %ptr = getelementptr i32, i32* %in, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = load atomic i32, i32* %gep seq_cst, align 4 + store i32 %val, i32* %out ret void } ; GCN-LABEL: {{^}}atomic_load_i32_addr64: ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_load_i32_addr64(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i32_addr64(i32* %in, i32* %out, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index - %val = load atomic i32, i32 addrspace(4)* %ptr seq_cst, align 4 - store i32 %val, i32 addrspace(4)* %out + %ptr = getelementptr i32, i32* %in, i64 %index + %val = load atomic i32, i32* %ptr seq_cst, align 4 + store i32 %val, i32* %out ret void } ; GCN-LABEL: {{^}}atomic_store_i32_offset: ; CIVI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} ; GFX9: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32* %out) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - store atomic i32 %in, i32 addrspace(4)* %gep seq_cst, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + store atomic i32 %in, i32* %gep seq_cst, align 4 ret void } ; GCN-LABEL: {{^}}atomic_store_i32: ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_store_i32(i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out seq_cst, align 4 + store atomic i32 %in, i32* %out seq_cst, align 4 ret void } ; GCN-LABEL: {{^}}atomic_store_i32_addr64_offset: ; CIVI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} ; GFX9: flat_store_dword 
v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} offset:16{{$}} -define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32* %out, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - store atomic i32 %in, i32 addrspace(4)* %gep seq_cst, align 4 + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + store atomic i32 %in, i32* %gep seq_cst, align 4 ret void } ; GCN-LABEL: {{^}}atomic_store_i32_addr64: ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32* %out, i64 %index) { entry: - %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index - store atomic i32 %in, i32 addrspace(4)* %ptr seq_cst, align 4 + %ptr = getelementptr i32, i32* %out, i64 %index + store atomic i32 %in, i32* %ptr seq_cst, align 4 ret void } Index: test/CodeGen/AMDGPU/flat_atomics_i64.ll =================================================================== --- test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -3,973 +3,973 @@ ; GCN-LABEL: {{^}}atomic_add_i64_offset: ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} -define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_add_i64_offset(i64* %out, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile add i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_add_i64_ret_offset: ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], 
v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_add_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_add_i64_ret_offset(i64* %out, i64* %out2, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile add i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset: ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} -define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile add i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64_offset: ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 
%tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile add i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_add_i64: ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_add_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_add_i64(i64* %out, i64 %in) { entry: - %tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst + %tmp0 = atomicrmw volatile add i64* %out, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_add_i64_ret: ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_add_i64_ret(i64* %out, i64* %out2, i64 %in) { entry: - %tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %tmp0 = atomicrmw volatile add i64* %out, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_add_i64_addr64: ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_add_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_addr64(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile add i64* %ptr, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64: ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 
v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile add i64* %ptr, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_and_i64_offset: ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_offset(i64* %out, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile and i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_and_i64_ret_offset: ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_ret_offset(i64* %out, i64* %out2, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile and i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset: ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define 
amdgpu_kernel void @atomic_and_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile and i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64_offset: ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile and i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_and_i64: ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_and_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_and_i64(i64* %out, i64 %in) { entry: - %tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst + %tmp0 = atomicrmw volatile and i64* %out, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_and_i64_ret: ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; 
GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_ret(i64* %out, i64* %out2, i64 %in) { entry: - %tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %tmp0 = atomicrmw volatile and i64* %out, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_and_i64_addr64: ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_and_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_addr64(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile and i64* %ptr, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64: ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile and i64* %ptr, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_sub_i64_offset: ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(4)* %out, i64 %in) { 
+define amdgpu_kernel void @atomic_sub_i64_offset(i64* %out, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile sub i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_sub_i64_ret_offset: ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64* %out, i64* %out2, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile sub i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset: ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile sub i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64_offset: ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void 
@atomic_sub_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile sub i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_sub_i64: ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64(i64* %out, i64 %in) { entry: - %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst + %tmp0 = atomicrmw volatile sub i64* %out, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_sub_i64_ret: ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_ret(i64* %out, i64* %out2, i64 %in) { entry: - %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %tmp0 = atomicrmw volatile sub i64* %out, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_sub_i64_addr64: ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_sub_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_addr64(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, 
i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile sub i64* %ptr, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64: ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile sub i64* %ptr, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_max_i64_offset: ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_offset(i64* %out, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile max i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_max_i64_ret_offset: ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_ret_offset(i64* %out, i64* %out2, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* 
%out, i64 4 - %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile max i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset: ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile max i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64_offset: ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile max i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_max_i64: ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void 
@atomic_max_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_max_i64(i64* %out, i64 %in) { entry: - %tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst + %tmp0 = atomicrmw volatile max i64* %out, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_max_i64_ret: ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_ret(i64* %out, i64* %out2, i64 %in) { entry: - %tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %tmp0 = atomicrmw volatile max i64* %out, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_max_i64_addr64: ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_addr64(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile max i64* %ptr, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64: ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile max i64 
addrspace(4)* %ptr, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile max i64* %ptr, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_umax_i64_offset: ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_offset(i64* %out, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile umax i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_umax_i64_ret_offset: ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64* %out, i64* %out2, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile umax i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset: ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile umax i64 
addrspace(4)* %gep, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile umax i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64_offset: ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile umax i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_umax_i64: ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64(i64* %out, i64 %in) { entry: - %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst + %tmp0 = atomicrmw volatile umax i64* %out, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_umax_i64_ret: ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_ret(i64* %out, i64* %out2, i64 %in) { entry: - %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst - store 
i64 %tmp0, i64 addrspace(4)* %out2 + %tmp0 = atomicrmw volatile umax i64* %out, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_umax_i64_addr64: ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_addr64(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile umax i64* %ptr, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64: ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile umax i64* %ptr, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_min_i64_offset: ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_offset(i64* %out, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile min i64* %gep, i64 %in seq_cst ret 
void } ; GCN-LABEL: {{^}}atomic_min_i64_ret_offset: ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_ret_offset(i64* %out, i64* %out2, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile min i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset: ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile min i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64_offset: ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 
addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile min i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_min_i64: ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_min_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_min_i64(i64* %out, i64 %in) { entry: - %tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst + %tmp0 = atomicrmw volatile min i64* %out, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_min_i64_ret: ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_ret(i64* %out, i64* %out2, i64 %in) { entry: - %tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %tmp0 = atomicrmw volatile min i64* %out, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_min_i64_addr64: ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_addr64(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile min i64* %ptr, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64: ; GCN: flat_atomic_smin_x2 
[[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile min i64* %ptr, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_umin_i64_offset: ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64_offset(i64* %out, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile umin i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_umin_i64_ret_offset: ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64* %out, i64* %out2, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile umin i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: 
{{^}}atomic_umin_i64_addr64_offset: ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile umin i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64_offset: ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile umin i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_umin_i64: ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64(i64* %out, i64 %in) { entry: - %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst + %tmp0 = atomicrmw volatile umin i64* %out, i64 %in seq_cst ret void } ; GCN-LABEL: 
{{^}}atomic_umin_i64_ret: ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64_ret(i64* %out, i64* %out2, i64 %in) { entry: - %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %tmp0 = atomicrmw volatile umin i64* %out, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_umin_i64_addr64: ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_addr64(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile umin i64* %ptr, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64: ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile umin i64* %ptr, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_or_i64_offset: ; GCN: 
flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_offset(i64* %out, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile or i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_or_i64_ret_offset: ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_ret_offset(i64* %out, i64* %out2, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile or i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset: ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile or i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64_offset: ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], 
v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile or i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_or_i64: ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_or_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_or_i64(i64* %out, i64 %in) { entry: - %tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst + %tmp0 = atomicrmw volatile or i64* %out, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_or_i64_ret: ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_ret(i64* %out, i64* %out2, i64 %in) { entry: - %tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %tmp0 = atomicrmw volatile or i64* %out, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_or_i64_addr64: ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_or_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { 
+define amdgpu_kernel void @atomic_or_i64_addr64(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile or i64* %ptr, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64: ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile or i64* %ptr, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_xchg_i64_offset: ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_offset(i64* %out, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile xchg i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset: ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void 
@atomic_xchg_i64_ret_offset(i64* %out, i64* %out2, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile xchg i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset: ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile xchg i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64_offset: ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile xchg i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: 
{{^}}atomic_xchg_i64: ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_xchg_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64(i64* %out, i64 %in) { entry: - %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst + %tmp0 = atomicrmw volatile xchg i64* %out, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_xchg_i64_ret: ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_ret(i64* %out, i64* %out2, i64 %in) { entry: - %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %tmp0 = atomicrmw volatile xchg i64* %out, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64: ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_xchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_addr64(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile xchg i64* %ptr, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64: ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64* %out, i64* %out2, 
i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile xchg i64* %ptr, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_xor_i64_offset: ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_offset(i64* %out, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile xor i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_xor_i64_ret_offset: ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64* %out, i64* %out2, i64 %in) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %gep = getelementptr i64, i64* %out, i64 4 + %tmp0 = atomicrmw volatile xor i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset: ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, 
i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile xor i64* %gep, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64_offset: ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %tmp0 = atomicrmw volatile xor i64* %gep, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_xor_i64: ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64(i64* %out, i64 %in) { entry: - %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst + %tmp0 = atomicrmw volatile xor i64* %out, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_xor_i64_ret: ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_ret(i64* %out, i64* %out2, i64 %in) { 
entry: - %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %tmp0 = atomicrmw volatile xor i64* %out, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_xor_i64_addr64: ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @atomic_xor_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_addr64(i64* %out, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile xor i64* %ptr, i64 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64: ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst - store i64 %tmp0, i64 addrspace(4)* %out2 + %ptr = getelementptr i64, i64* %out, i64 %index + %tmp0 = atomicrmw volatile xor i64* %ptr, i64 %in seq_cst + store i64 %tmp0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_load_i64_offset: ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_load_i64_offset(i64* %in, i64* %out) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %in, i64 4 - %val = load 
atomic i64, i64 addrspace(4)* %gep seq_cst, align 8 - store i64 %val, i64 addrspace(4)* %out + %gep = getelementptr i64, i64* %in, i64 4 + %val = load atomic i64, i64* %gep seq_cst, align 8 + store i64 %val, i64* %out ret void } ; GCN-LABEL: {{^}}atomic_load_i64: ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_load_i64(i64 addrspace(4)* %in, i64 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_load_i64(i64* %in, i64* %out) { entry: - %val = load atomic i64, i64 addrspace(4)* %in seq_cst, align 8 - store i64 %val, i64 addrspace(4)* %out + %val = load atomic i64, i64* %in seq_cst, align 8 + store i64 %val, i64* %out ret void } ; GCN-LABEL: {{^}}atomic_load_i64_addr64_offset: ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64* %in, i64* %out, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %val = load atomic i64, i64 addrspace(4)* %gep seq_cst, align 8 - store i64 %val, i64 addrspace(4)* %out + %ptr = getelementptr i64, i64* %in, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %val = load atomic i64, i64* %gep seq_cst, align 8 + store i64 %val, i64* %out ret void } ; GCN-LABEL: {{^}}atomic_load_i64_addr64: ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @atomic_load_i64_addr64(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i64_addr64(i64* %in, i64* %out, i64 %index) { entry: - %ptr = getelementptr 
i64, i64 addrspace(4)* %in, i64 %index - %val = load atomic i64, i64 addrspace(4)* %ptr seq_cst, align 8 - store i64 %val, i64 addrspace(4)* %out + %ptr = getelementptr i64, i64* %in, i64 %index + %val = load atomic i64, i64* %ptr seq_cst, align 8 + store i64 %val, i64* %out ret void } ; GCN-LABEL: {{^}}atomic_store_i64_offset: ; GCN: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64* %out) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - store atomic i64 %in, i64 addrspace(4)* %gep seq_cst, align 8 + %gep = getelementptr i64, i64* %out, i64 4 + store atomic i64 %in, i64* %gep seq_cst, align 8 ret void } ; GCN-LABEL: {{^}}atomic_store_i64: ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] -define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_store_i64(i64 %in, i64* %out) { entry: - store atomic i64 %in, i64 addrspace(4)* %out seq_cst, align 8 + store atomic i64 %in, i64* %out seq_cst, align 8 ret void } ; GCN-LABEL: {{^}}atomic_store_i64_addr64_offset: ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}]{{$}} -define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64* %out, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - store atomic i64 %in, i64 addrspace(4)* %gep seq_cst, align 8 + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + store atomic i64 %in, i64* %gep seq_cst, align 8 ret void } ; GCN-LABEL: {{^}}atomic_store_i64_addr64: ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}]{{$}} 
-define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64* %out, i64 %index) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - store atomic i64 %in, i64 addrspace(4)* %ptr seq_cst, align 8 + %ptr = getelementptr i64, i64* %out, i64 %index + store atomic i64 %in, i64* %ptr seq_cst, align 8 ret void } ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_offset: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(4)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64* %out, i64 %in, i64 %old) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst + %gep = getelementptr i64, i64* %out, i64 4 + %val = cmpxchg volatile i64* %gep, i64 %old, i64 %in seq_cst seq_cst ret void } ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_soffset: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(4)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64* %out, i64 %in, i64 %old) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 9000 - %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst + %gep = getelementptr i64, i64* %out, i64 9000 + %val = cmpxchg volatile i64* %gep, i64 %old, i64 %in seq_cst seq_cst ret void } ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset: ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: -define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) { +define 
amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64* %out, i64* %out2, i64 %in, i64 %old) { entry: - %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 - %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst + %gep = getelementptr i64, i64* %out, i64 4 + %val = cmpxchg volatile i64* %gep, i64 %old, i64 %in seq_cst seq_cst %extract0 = extractvalue { i64, i1 } %val, 0 - store i64 %extract0, i64 addrspace(4)* %out2 + store i64 %extract0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64_offset: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64* %out, i64 %in, i64 %index, i64 %old) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %val = cmpxchg volatile i64* %gep, i64 %old, i64 %in seq_cst seq_cst ret void } ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64_offset: ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: -define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index, i64 %old) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 - %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst + %ptr = getelementptr i64, i64* 
%out, i64 %index + %gep = getelementptr i64, i64* %ptr, i64 4 + %val = cmpxchg volatile i64* %gep, i64 %old, i64 %in seq_cst seq_cst %extract0 = extractvalue { i64, i1 } %val, 0 - store i64 %extract0, i64 addrspace(4)* %out2 + store i64 %extract0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_cmpxchg_i64: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(4)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64(i64* %out, i64 %in, i64 %old) { entry: - %val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst + %val = cmpxchg volatile i64* %out, i64 %old, i64 %in seq_cst seq_cst ret void } ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret: ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: -define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64* %out, i64* %out2, i64 %in, i64 %old) { entry: - %val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst + %val = cmpxchg volatile i64* %out, i64 %old, i64 %in seq_cst seq_cst %extract0 = extractvalue { i64, i1 } %val, 0 - store i64 %extract0, i64 addrspace(4)* %out2 + store i64 %extract0, i64* %out2 ret void } ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64* %out, i64 %in, i64 %index, i64 %old) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst + %ptr = getelementptr i64, i64* %out, i64 
%index + %val = cmpxchg volatile i64* %ptr, i64 %old, i64 %in seq_cst seq_cst ret void } ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64: ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: -define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index, i64 %old) { entry: - %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index - %val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst + %ptr = getelementptr i64, i64* %out, i64 %index + %val = cmpxchg volatile i64* %ptr, i64 %old, i64 %in seq_cst seq_cst %extract0 = extractvalue { i64, i1 } %val, 0 - store i64 %extract0, i64 addrspace(4)* %out2 + store i64 %extract0, i64* %out2 ret void } Index: test/CodeGen/AMDGPU/frame-index-elimination.ll =================================================================== --- test/CodeGen/AMDGPU/frame-index-elimination.ll +++ test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -1,5 +1,6 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +target datalayout = "A5" ; Test that non-entry function frame indices are expanded properly to ; give an index relative to the scratch wave offset register @@ -18,8 +19,8 @@ ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 define void @func_mov_fi_i32() #0 { - %alloca = alloca i32 - store volatile i32* %alloca, i32* addrspace(3)* undef + %alloca = alloca i32, addrspace(5) + store volatile i32 addrspace(5)* %alloca, i32 
addrspace(5)* addrspace(3)* undef ret void } @@ -42,9 +43,9 @@ ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 define void @func_add_constant_to_fi_i32() #0 { - %alloca = alloca [2 x i32], align 4 - %gep0 = getelementptr inbounds [2 x i32], [2 x i32]* %alloca, i32 0, i32 1 - store volatile i32* %gep0, i32* addrspace(3)* undef + %alloca = alloca [2 x i32], align 4, addrspace(5) + %gep0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %alloca, i32 0, i32 1 + store volatile i32 addrspace(5)* %gep0, i32 addrspace(5)* addrspace(3)* undef ret void } @@ -64,8 +65,8 @@ ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 define void @func_other_fi_user_i32() #0 { - %alloca = alloca [2 x i32], align 4 - %ptrtoint = ptrtoint [2 x i32]* %alloca to i32 + %alloca = alloca [2 x i32], align 4, addrspace(5) + %ptrtoint = ptrtoint [2 x i32] addrspace(5)* %alloca to i32 %mul = mul i32 %ptrtoint, 9 store volatile i32 %mul, i32 addrspace(3)* undef ret void @@ -74,16 +75,16 @@ ; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr: ; GCN: v_mov_b32_e32 v1, 15{{$}} ; GCN: buffer_store_dword v1, v0, s[0:3], s4 offen{{$}} -define void @func_store_private_arg_i32_ptr(i32* %ptr) #0 { - store volatile i32 15, i32* %ptr +define void @func_store_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 { + store volatile i32 15, i32 addrspace(5)* %ptr ret void } ; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr: ; GCN: s_waitcnt ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s4 offen{{$}} -define void @func_load_private_arg_i32_ptr(i32* %ptr) #0 { - %val = load volatile i32, i32* %ptr +define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 { + %val = load volatile i32, i32 addrspace(5)* %ptr ret void } @@ -102,11 +103,11 @@ ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 -define void @void_func_byval_struct_i8_i32_ptr({ i8, i32 }* byval %arg0) #0 { - %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0 - %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1 
- %load1 = load i32, i32* %gep1 - store volatile i32* %gep1, i32* addrspace(3)* undef +define void @void_func_byval_struct_i8_i32_ptr({ i8, i32 } addrspace(5)* byval %arg0) #0 { + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 1 + %load1 = load i32, i32 addrspace(5)* %gep1 + store volatile i32 addrspace(5)* %gep1, i32 addrspace(5)* addrspace(3)* undef ret void } @@ -115,11 +116,11 @@ ; GCN-NEXT: s_mov_b32 s5, s32 ; GCN-NEXT: buffer_load_ubyte v0, off, s[0:3], s5 ; GCN_NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4 -define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 }* byval %arg0) #0 { - %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0 - %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1 - %load0 = load i8, i8* %gep0 - %load1 = load i32, i32* %gep1 +define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* byval %arg0) #0 { + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 1 + %load0 = load i8, i8 addrspace(5)* %gep0 + %load1 = load i32, i32 addrspace(5)* %gep1 store volatile i8 %load0, i8 addrspace(3)* undef store volatile i32 %load1, i32 addrspace(3)* undef ret void @@ -146,15 +147,15 @@ ; GFX9: buffer_load_dword v1, v{{[0-9]+}}, s[0:3], s4 offen offset:4{{$}} ; GCN: ds_write_b32 -define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 }* byval %arg0, i32 %arg2) #0 { +define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 } addrspace(5)* byval %arg0, i32 %arg2) #0 { %cmp = icmp eq i32 %arg2, 0 br i1 %cmp, label %bb, label %ret bb: - %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0 - %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, 
i32 1 - %load1 = load volatile i32, i32* %gep1 - store volatile i32* %gep1, i32* addrspace(3)* undef + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 1 + %load1 = load volatile i32, i32 addrspace(5)* %gep1 + store volatile i32 addrspace(5)* %gep1, i32 addrspace(5)* addrspace(3)* undef br label %ret ret: @@ -175,12 +176,12 @@ ; GCN: v_mul_lo_i32 v0, v0, 9 ; GCN: ds_write_b32 v0, v0 define void @func_other_fi_user_non_inline_imm_offset_i32() #0 { - %alloca0 = alloca [128 x i32], align 4 - %alloca1 = alloca [8 x i32], align 4 - %gep0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca0, i32 0, i32 65 - %gep1 = getelementptr inbounds [8 x i32], [8 x i32]* %alloca1, i32 0, i32 0 - store volatile i32 7, i32* %gep0 - %ptrtoint = ptrtoint i32* %gep1 to i32 + %alloca0 = alloca [128 x i32], align 4, addrspace(5) + %alloca1 = alloca [8 x i32], align 4, addrspace(5) + %gep0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca0, i32 0, i32 65 + %gep1 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %alloca1, i32 0, i32 0 + store volatile i32 7, i32 addrspace(5)* %gep0 + %ptrtoint = ptrtoint i32 addrspace(5)* %gep1 to i32 %mul = mul i32 %ptrtoint, 9 store volatile i32 %mul, i32 addrspace(3)* undef ret void @@ -199,20 +200,20 @@ ; GCN: v_mul_lo_i32 v0, v0, 9 ; GCN: ds_write_b32 v0, v0 define void @func_other_fi_user_non_inline_imm_offset_i32_vcc_live() #0 { - %alloca0 = alloca [128 x i32], align 4 - %alloca1 = alloca [8 x i32], align 4 + %alloca0 = alloca [128 x i32], align 4, addrspace(5) + %alloca1 = alloca [8 x i32], align 4, addrspace(5) %vcc = call i64 asm sideeffect "; def $0", "={VCC}"() - %gep0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca0, i32 0, i32 65 - %gep1 = getelementptr inbounds [8 x i32], [8 x i32]* %alloca1, i32 0, i32 0 - store volatile i32 7, i32* %gep0 + %gep0 = getelementptr 
inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca0, i32 0, i32 65 + %gep1 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %alloca1, i32 0, i32 0 + store volatile i32 7, i32 addrspace(5)* %gep0 call void asm sideeffect "; use $0", "{VCC}"(i64 %vcc) - %ptrtoint = ptrtoint i32* %gep1 to i32 + %ptrtoint = ptrtoint i32 addrspace(5)* %gep1 to i32 %mul = mul i32 %ptrtoint, 9 store volatile i32 %mul, i32 addrspace(3)* undef ret void } -declare void @func(<4 x float>* nocapture) #0 +declare void @func(<4 x float> addrspace(5)* nocapture) #0 ; undef flag not preserved in eliminateFrameIndex when handling the ; stores in the middle block. @@ -225,16 +226,16 @@ ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset: define void @undefined_stack_store_reg(float %arg, i32 %arg1) #0 { bb: - %tmp = alloca <4 x float>, align 16 + %tmp = alloca <4 x float>, align 16, addrspace(5) %tmp2 = insertelement <4 x float> undef, float %arg, i32 0 - store <4 x float> %tmp2, <4 x float>* undef + store <4 x float> %tmp2, <4 x float> addrspace(5)* undef %tmp3 = icmp eq i32 %arg1, 0 br i1 %tmp3, label %bb4, label %bb5 bb4: - call void @func(<4 x float>* nonnull undef) - store <4 x float> %tmp2, <4 x float>* %tmp, align 16 - call void @func(<4 x float>* nonnull %tmp) + call void @func(<4 x float> addrspace(5)* nonnull undef) + store <4 x float> %tmp2, <4 x float> addrspace(5)* %tmp, align 16 + call void @func(<4 x float> addrspace(5)* nonnull %tmp) br label %bb5 bb5: @@ -245,15 +246,15 @@ ; GCN: s_and_saveexec_b64 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s5 offset:12 define void @alloca_ptr_nonentry_block(i32 %arg0) #0 { - %alloca0 = alloca { i8, i32 }, align 4 + %alloca0 = alloca { i8, i32 }, align 4, addrspace(5) %cmp = icmp eq i32 %arg0, 0 br i1 %cmp, label %bb, label %ret bb: - %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %alloca0, i32 0, i32 0 - %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %alloca0, i32 0, i32 1 - %load1 = load 
volatile i32, i32* %gep1 - store volatile i32* %gep1, i32* addrspace(3)* undef + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %alloca0, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %alloca0, i32 0, i32 1 + %load1 = load volatile i32, i32 addrspace(5)* %gep1 + store volatile i32 addrspace(5)* %gep1, i32 addrspace(5)* addrspace(3)* undef br label %ret ret: Index: test/CodeGen/AMDGPU/function-args.ll =================================================================== --- test/CodeGen/AMDGPU/function-args.ll +++ test/CodeGen/AMDGPU/function-args.ll @@ -506,8 +506,8 @@ ; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s5 offset:8{{$}} ; GCN-DAG: buffer_store_dword v[[ELT1]] ; GCN-DAG: buffer_store_byte v[[ELT0]] -define void @void_func_byval_struct_i8_i32({ i8, i32 }* byval %arg0) #0 { - %arg0.load = load { i8, i32 }, { i8, i32 }* %arg0 +define void @void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval %arg0) #0 { + %arg0.load = load { i8, i32 }, { i8, i32 } addrspace(5)* %arg0 store { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef ret void } @@ -520,9 +520,9 @@ ; GCN: ds_write_b32 v0, v0 ; GCN: s_setpc_b64 -define void @void_func_byval_struct_i8_i32_x2({ i8, i32 }* byval %arg0, { i8, i32 }* byval %arg1, i32 %arg2) #0 { - %arg0.load = load volatile { i8, i32 }, { i8, i32 }* %arg0 - %arg1.load = load volatile { i8, i32 }, { i8, i32 }* %arg1 +define void @void_func_byval_struct_i8_i32_x2({ i8, i32 } addrspace(5)* byval %arg0, { i8, i32 } addrspace(5)* byval %arg1, i32 %arg2) #0 { + %arg0.load = load volatile { i8, i32 }, { i8, i32 } addrspace(5)* %arg0 + %arg1.load = load volatile { i8, i32 }, { i8, i32 } addrspace(5)* %arg1 store volatile { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef store volatile { i8, i32 } %arg1.load, { i8, i32 } addrspace(1)* undef store volatile i32 %arg2, i32 addrspace(3)* undef @@ -535,9 +535,9 @@ ; GCN-DAG: buffer_load_dword 
v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s5 offset:12{{$}} ; GCN-DAG: buffer_store_dword v[[ARG0_LOAD]], off ; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ARG1_LOAD0]]:[[ARG1_LOAD1]]{{\]}}, off -define void @void_func_byval_i32_byval_i64(i32* byval %arg0, i64* byval %arg1) #0 { - %arg0.load = load i32, i32* %arg0 - %arg1.load = load i64, i64* %arg1 +define void @void_func_byval_i32_byval_i64(i32 addrspace(5)* byval %arg0, i64 addrspace(5)* byval %arg1) #0 { + %arg0.load = load i32, i32 addrspace(5)* %arg0 + %arg1.load = load i64, i64 addrspace(5)* %arg1 store i32 %arg0.load, i32 addrspace(1)* undef store i64 %arg1.load, i64 addrspace(1)* undef ret void Index: test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll =================================================================== --- test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll +++ test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll @@ -276,7 +276,7 @@ ; CHECK-NEXT: LanguageVersion: [ 2, 0 ] ; CHECK-NEXT: Args: ; CHECK-NEXT: - Name: a -; CHECK-NEXT: TypeName: 'int *' +; CHECK-NEXT: TypeName: 'int addrspace(5)*' ; CHECK-NEXT: Size: 8 ; CHECK-NEXT: Align: 8 ; CHECK-NEXT: ValueKind: GlobalBuffer @@ -443,7 +443,7 @@ ; CHECK-NEXT: ValueKind: HiddenPrintfBuffer ; CHECK-NEXT: ValueType: I8 ; CHECK-NEXT: AddrSpaceQual: Global -define amdgpu_kernel void @test_struct(%struct.A* byval %a) +define amdgpu_kernel void @test_struct(%struct.A addrspace(5)* byval %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 !kernel_arg_base_type !20 !kernel_arg_type_qual !4 { ret void @@ -539,7 +539,7 @@ ; CHECK-NEXT: LanguageVersion: [ 2, 0 ] ; CHECK-NEXT: Args: ; CHECK-NEXT: - Name: g -; CHECK-NEXT: TypeName: 'int *' +; CHECK-NEXT: TypeName: 'int addrspace(5)*' ; CHECK-NEXT: Size: 8 ; CHECK-NEXT: Align: 8 ; CHECK-NEXT: ValueKind: GlobalBuffer @@ -547,7 +547,7 @@ ; CHECK-NEXT: AddrSpaceQual: Global ; CHECK-NEXT: AccQual: Default ; CHECK-NEXT: - Name: c -; CHECK-NEXT: TypeName: 'int *' +; CHECK-NEXT: TypeName: 'int 
addrspace(5)*' ; CHECK-NEXT: Size: 8 ; CHECK-NEXT: Align: 8 ; CHECK-NEXT: ValueKind: GlobalBuffer @@ -555,7 +555,7 @@ ; CHECK-NEXT: AddrSpaceQual: Constant ; CHECK-NEXT: AccQual: Default ; CHECK-NEXT: - Name: l -; CHECK-NEXT: TypeName: 'int *' +; CHECK-NEXT: TypeName: 'int addrspace(5)*' ; CHECK-NEXT: Size: 4 ; CHECK-NEXT: Align: 4 ; CHECK-NEXT: ValueKind: DynamicSharedPointer @@ -594,7 +594,7 @@ ; CHECK-NEXT: LanguageVersion: [ 2, 0 ] ; CHECK-NEXT: Args: ; CHECK-NEXT: - Name: a -; CHECK-NEXT: TypeName: 'int *' +; CHECK-NEXT: TypeName: 'int addrspace(5)*' ; CHECK-NEXT: Size: 8 ; CHECK-NEXT: Align: 8 ; CHECK-NEXT: ValueKind: GlobalBuffer @@ -603,7 +603,7 @@ ; CHECK-NEXT: AccQual: Default ; CHECK-NEXT: IsVolatile: true ; CHECK-NEXT: - Name: b -; CHECK-NEXT: TypeName: 'int *' +; CHECK-NEXT: TypeName: 'int addrspace(5)*' ; CHECK-NEXT: Size: 8 ; CHECK-NEXT: Align: 8 ; CHECK-NEXT: ValueKind: GlobalBuffer @@ -613,7 +613,7 @@ ; CHECK-NEXT: IsConst: true ; CHECK-NEXT: IsRestrict: true ; CHECK-NEXT: - Name: c -; CHECK-NEXT: TypeName: 'int *' +; CHECK-NEXT: TypeName: 'int addrspace(5)*' ; CHECK-NEXT: Size: 8 ; CHECK-NEXT: Align: 8 ; CHECK-NEXT: ValueKind: Pipe @@ -1043,7 +1043,7 @@ ; CHECK-NEXT: LanguageVersion: [ 2, 0 ] ; CHECK-NEXT: Args: ; CHECK-NEXT: - Name: a -; CHECK-NEXT: TypeName: 'int **' +; CHECK-NEXT: TypeName: 'int addrspace(5)* addrspace(5)*' ; CHECK-NEXT: Size: 8 ; CHECK-NEXT: Align: 8 ; CHECK-NEXT: ValueKind: GlobalBuffer @@ -1067,7 +1067,7 @@ ; CHECK-NEXT: ValueKind: HiddenPrintfBuffer ; CHECK-NEXT: ValueType: I8 ; CHECK-NEXT: AddrSpaceQual: Global -define amdgpu_kernel void @test_arg_ptr_to_ptr(i32* addrspace(1)* %a) +define amdgpu_kernel void @test_arg_ptr_to_ptr(i32 addrspace(5)* addrspace(1)* %a) !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !80 !kernel_arg_base_type !80 !kernel_arg_type_qual !4 { ret void @@ -1103,7 +1103,7 @@ ; CHECK-NEXT: ValueKind: HiddenPrintfBuffer ; CHECK-NEXT: ValueType: I8 ; CHECK-NEXT: AddrSpaceQual: 
Global -define amdgpu_kernel void @test_arg_struct_contains_ptr(%struct.B* byval %a) +define amdgpu_kernel void @test_arg_struct_contains_ptr(%struct.B addrspace(5)* byval %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !82 !kernel_arg_base_type !82 !kernel_arg_type_qual !4 { ret void @@ -1115,7 +1115,7 @@ ; CHECK-NEXT: LanguageVersion: [ 2, 0 ] ; CHECK-NEXT: Args: ; CHECK-NEXT: - Name: a -; CHECK-NEXT: TypeName: 'global int* __attribute__((ext_vector_type(2)))' +; CHECK-NEXT: TypeName: 'global int addrspace(5)* __attribute__((ext_vector_type(2)))' ; CHECK-NEXT: Size: 16 ; CHECK-NEXT: Align: 16 ; CHECK-NEXT: ValueKind: ByValue @@ -1187,7 +1187,7 @@ ; CHECK-NEXT: LanguageVersion: [ 2, 0 ] ; CHECK-NEXT: Args: ; CHECK-NEXT: - Name: a -; CHECK-NEXT: TypeName: 'long *' +; CHECK-NEXT: TypeName: 'long addrspace(5)*' ; CHECK-NEXT: Size: 8 ; CHECK-NEXT: Align: 8 ; CHECK-NEXT: ValueKind: GlobalBuffer @@ -1195,7 +1195,7 @@ ; CHECK-NEXT: AddrSpaceQual: Global ; CHECK-NEXT: AccQual: Default ; CHECK-NEXT: - Name: b -; CHECK-NEXT: TypeName: 'char *' +; CHECK-NEXT: TypeName: 'char addrspace(5)*' ; CHECK-NEXT: Size: 4 ; CHECK-NEXT: Align: 4 ; CHECK-NEXT: ValueKind: DynamicSharedPointer @@ -1204,7 +1204,7 @@ ; CHECK-NEXT: AddrSpaceQual: Local ; CHECK-NEXT: AccQual: Default ; CHECK-NEXT: - Name: c -; CHECK-NEXT: TypeName: 'char2 *' +; CHECK-NEXT: TypeName: 'char2 addrspace(5)*' ; CHECK-NEXT: Size: 4 ; CHECK-NEXT: Align: 4 ; CHECK-NEXT: ValueKind: DynamicSharedPointer @@ -1213,7 +1213,7 @@ ; CHECK-NEXT: AddrSpaceQual: Local ; CHECK-NEXT: AccQual: Default ; CHECK-NEXT: - Name: d -; CHECK-NEXT: TypeName: 'char3 *' +; CHECK-NEXT: TypeName: 'char3 addrspace(5)*' ; CHECK-NEXT: Size: 4 ; CHECK-NEXT: Align: 4 ; CHECK-NEXT: ValueKind: DynamicSharedPointer @@ -1222,7 +1222,7 @@ ; CHECK-NEXT: AddrSpaceQual: Local ; CHECK-NEXT: AccQual: Default ; CHECK-NEXT: - Name: e -; CHECK-NEXT: TypeName: 'char4 *' +; CHECK-NEXT: TypeName: 'char4 addrspace(5)*' ; CHECK-NEXT: Size: 4 
; CHECK-NEXT: Align: 4 ; CHECK-NEXT: ValueKind: DynamicSharedPointer @@ -1231,7 +1231,7 @@ ; CHECK-NEXT: AddrSpaceQual: Local ; CHECK-NEXT: AccQual: Default ; CHECK-NEXT: - Name: f -; CHECK-NEXT: TypeName: 'char8 *' +; CHECK-NEXT: TypeName: 'char8 addrspace(5)*' ; CHECK-NEXT: Size: 4 ; CHECK-NEXT: Align: 4 ; CHECK-NEXT: ValueKind: DynamicSharedPointer @@ -1240,7 +1240,7 @@ ; CHECK-NEXT: AddrSpaceQual: Local ; CHECK-NEXT: AccQual: Default ; CHECK-NEXT: - Name: g -; CHECK-NEXT: TypeName: 'char16 *' +; CHECK-NEXT: TypeName: 'char16 addrspace(5)*' ; CHECK-NEXT: Size: 4 ; CHECK-NEXT: Align: 4 ; CHECK-NEXT: ValueKind: DynamicSharedPointer @@ -1309,7 +1309,7 @@ ; CHECK-NEXT: ValueType: I8 ; CHECK-NEXT: AddrSpaceQual: Global define amdgpu_kernel void @__test_block_invoke_kernel( - <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg) #0 + <{ i32, i32, i8*, i8 addrspace(1)*, i8 }> %arg) #0 !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !110 !kernel_arg_base_type !110 !kernel_arg_type_qual !4 { ret void @@ -1380,7 +1380,7 @@ !13 = !{!"half8"} !14 = !{!"float16"} !15 = !{!"double16"} -!16 = !{!"int *"} +!16 = !{!"int addrspace(5)*"} !17 = !{!"image2d_t"} !18 = !{!"sampler_t"} !19 = !{!"queue_t"} @@ -1396,23 +1396,23 @@ !29 = !{i8 undef, i32 1} !30 = !{i16 undef, i32 1} !31 = !{i64 undef, i32 1} -!32 = !{i32 *undef, i32 1} +!32 = !{i32 addrspace(5)*undef, i32 1} !50 = !{i32 1, i32 2, i32 3} -!51 = !{!"int *", !"int *", !"int *"} +!51 = !{!"int addrspace(5)*", !"int addrspace(5)*", !"int addrspace(5)*"} !60 = !{i32 1, i32 1, i32 1} !61 = !{!"read_only", !"write_only", !"read_write"} !62 = !{!"image1d_t", !"image2d_t", !"image3d_t"} !70 = !{!"volatile", !"const restrict", !"pipe"} -!80 = !{!"int **"} +!80 = !{!"int addrspace(5)* addrspace(5)*"} !81 = !{i32 1} !82 = !{!"struct B"} -!83 = !{!"global int* __attribute__((ext_vector_type(2)))"} +!83 = !{!"global int addrspace(5)* __attribute__((ext_vector_type(2)))"} !84 = !{!"clk_event_t"} 
!opencl.ocl.version = !{!90} !90 = !{i32 2, i32 0} !91 = !{i32 0, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3} !92 = !{!"none", !"none", !"none", !"none", !"none", !"none", !"none"} -!93 = !{!"long *", !"char *", !"char2 *", !"char3 *", !"char4 *", !"char8 *", !"char16 *"} +!93 = !{!"long addrspace(5)*", !"char addrspace(5)*", !"char2 addrspace(5)*", !"char3 addrspace(5)*", !"char4 addrspace(5)*", !"char8 addrspace(5)*", !"char16 addrspace(5)*"} !94 = !{!"", !"", !"", !"", !"", !"", !""} !100 = !{!"1:1:4:%d\5Cn"} !101 = !{!"2:1:8:%g\5Cn"} Index: test/CodeGen/AMDGPU/huge-private-buffer.ll =================================================================== --- test/CodeGen/AMDGPU/huge-private-buffer.ll +++ test/CodeGen/AMDGPU/huge-private-buffer.ll @@ -1,13 +1,14 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +target datalayout = "A5" ; GCN-LABEL: {{^}}scratch_buffer_known_high_bit_small: ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4 ; GCN-NOT: [[FI]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]] define amdgpu_kernel void @scratch_buffer_known_high_bit_small() #0 { - %alloca = alloca i32, align 4 - store volatile i32 0, i32* %alloca - %toint = ptrtoint i32* %alloca to i32 + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + %toint = ptrtoint i32 addrspace(5)* %alloca to i32 %masked = and i32 %toint, 2147483647 store volatile i32 %masked, i32 addrspace(1)* undef ret void @@ -19,9 +20,9 @@ ; GCN-DAG: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x7ffffffc, [[FI]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] define amdgpu_kernel void @scratch_buffer_known_high_bit_huge() #1 { - %alloca = alloca i32, align 4 - store volatile i32 0, i32* %alloca - %toint = ptrtoint i32* %alloca to i32 + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + %toint = ptrtoint i32 addrspace(5)* %alloca to i32 %masked = and i32 
%toint, 2147483647 store volatile i32 %masked, i32 addrspace(1)* undef ret void Index: test/CodeGen/AMDGPU/indirect-private-64.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-private-64.ll +++ test/CodeGen/AMDGPU/indirect-private-64.ll @@ -3,6 +3,7 @@ ; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=CI-ALLOCA16 -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=CI-PROMOTE -check-prefix=SI %s +target datalayout = "A5" declare void @llvm.amdgcn.s.barrier() #0 @@ -22,11 +23,11 @@ ; CI-PROMOTE: ds_read_b64 define amdgpu_kernel void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 { %val = load double, double addrspace(1)* %in, align 8 - %array = alloca [8 x double], align 8 - %ptr = getelementptr inbounds [8 x double], [8 x double]* %array, i32 0, i32 %b - store double %val, double* %ptr, align 8 + %array = alloca [8 x double], align 8, addrspace(5) + %ptr = getelementptr inbounds [8 x double], [8 x double] addrspace(5)* %array, i32 0, i32 %b + store double %val, double addrspace(5)* %ptr, align 8 call void @llvm.amdgcn.s.barrier() - %result = load double, double* %ptr, align 8 + %result = load double, double addrspace(5)* %ptr, align 8 store double %result, double addrspace(1)* %out, align 8 ret void } @@ -53,11 +54,11 @@ ; CI-PROMOTE: ds_read2_b64 define amdgpu_kernel void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 { %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16 - %array = alloca [4 x <2 x double>], align 16 - 
%ptr = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* %array, i32 0, i32 %b - store <2 x double> %val, <2 x double>* %ptr, align 16 + %array = alloca [4 x <2 x double>], align 16, addrspace(5) + %ptr = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>] addrspace(5)* %array, i32 0, i32 %b + store <2 x double> %val, <2 x double> addrspace(5)* %ptr, align 16 call void @llvm.amdgcn.s.barrier() - %result = load <2 x double>, <2 x double>* %ptr, align 16 + %result = load <2 x double>, <2 x double> addrspace(5)* %ptr, align 16 store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16 ret void } @@ -79,11 +80,11 @@ ; CI-PROMOTE: ds_read_b64 define amdgpu_kernel void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 { %val = load i64, i64 addrspace(1)* %in, align 8 - %array = alloca [8 x i64], align 8 - %ptr = getelementptr inbounds [8 x i64], [8 x i64]* %array, i32 0, i32 %b - store i64 %val, i64* %ptr, align 8 + %array = alloca [8 x i64], align 8, addrspace(5) + %ptr = getelementptr inbounds [8 x i64], [8 x i64] addrspace(5)* %array, i32 0, i32 %b + store i64 %val, i64 addrspace(5)* %ptr, align 8 call void @llvm.amdgcn.s.barrier() - %result = load i64, i64* %ptr, align 8 + %result = load i64, i64 addrspace(5)* %ptr, align 8 store i64 %result, i64 addrspace(1)* %out, align 8 ret void } @@ -111,11 +112,11 @@ ; CI-PROMOTE: ds_read2_b64 define amdgpu_kernel void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 - %array = alloca [4 x <2 x i64>], align 16 - %ptr = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* %array, i32 0, i32 %b - store <2 x i64> %val, <2 x i64>* %ptr, align 16 + %array = alloca [4 x <2 x i64>], align 16, addrspace(5) + %ptr = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>] addrspace(5)* %array, i32 0, i32 %b + store <2 x 
i64> %val, <2 x i64> addrspace(5)* %ptr, align 16 call void @llvm.amdgcn.s.barrier() - %result = load <2 x i64>, <2 x i64>* %ptr, align 16 + %result = load <2 x i64>, <2 x i64> addrspace(5)* %ptr, align 16 store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16 ret void } Index: test/CodeGen/AMDGPU/insert_subreg.ll =================================================================== --- test/CodeGen/AMDGPU/insert_subreg.ll +++ test/CodeGen/AMDGPU/insert_subreg.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s +target datalayout = "A5" ; Test that INSERT_SUBREG instructions don't have non-register operands after ; instruction selection. @@ -8,8 +9,8 @@ ; CHECK-LABEL: test: define amdgpu_kernel void @test(i64 addrspace(1)* %out) { entry: - %tmp0 = alloca [16 x i32] - %tmp1 = ptrtoint [16 x i32]* %tmp0 to i32 + %tmp0 = alloca [16 x i32], addrspace(5) + %tmp1 = ptrtoint [16 x i32] addrspace(5)* %tmp0 to i32 %tmp2 = sext i32 %tmp1 to i64 store i64 %tmp2, i64 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/kernarg-stack-alignment.ll =================================================================== --- test/CodeGen/AMDGPU/kernarg-stack-alignment.ll +++ test/CodeGen/AMDGPU/kernarg-stack-alignment.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck %s +target datalayout = "A5" ; Test that the alignment of kernel arguments does not impact the ; alignment of the stack @@ -6,39 +7,39 @@ ; CHECK-LABEL: {{^}}no_args: ; CHECK: ScratchSize: 5{{$}} define amdgpu_kernel void @no_args() { - %alloca = alloca i8 - store volatile i8 0, i8* %alloca + %alloca = alloca i8, addrspace(5) + store volatile i8 0, i8 addrspace(5)* %alloca ret void } ; CHECK-LABEL: {{^}}force_align32: ; CHECK: ScratchSize: 5{{$}} define amdgpu_kernel void @force_align32(<8 x i32>) { - %alloca = alloca i8 - store volatile 
i8 0, i8* %alloca + %alloca = alloca i8, addrspace(5) + store volatile i8 0, i8 addrspace(5)* %alloca ret void } ; CHECK-LABEL: {{^}}force_align64: ; CHECK: ScratchSize: 5{{$}} define amdgpu_kernel void @force_align64(<16 x i32>) { - %alloca = alloca i8 - store volatile i8 0, i8* %alloca + %alloca = alloca i8, addrspace(5) + store volatile i8 0, i8 addrspace(5)* %alloca ret void } ; CHECK-LABEL: {{^}}force_align128: ; CHECK: ScratchSize: 5{{$}} define amdgpu_kernel void @force_align128(<32 x i32>) { - %alloca = alloca i8 - store volatile i8 0, i8* %alloca + %alloca = alloca i8, addrspace(5) + store volatile i8 0, i8 addrspace(5)* %alloca ret void } ; CHECK-LABEL: {{^}}force_align256: ; CHECK: ScratchSize: 5{{$}} define amdgpu_kernel void @force_align256(<64 x i32>) { - %alloca = alloca i8 - store volatile i8 0, i8* %alloca + %alloca = alloca i8, addrspace(5) + store volatile i8 0, i8 addrspace(5)* %alloca ret void } Index: test/CodeGen/AMDGPU/large-alloca-compute.ll =================================================================== --- test/CodeGen/AMDGPU/large-alloca-compute.ll +++ test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -3,6 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=ALL %s ; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s ; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s +target datalayout = "A5" ; FIXME: align on alloca seems to be ignored for private_segment_alignment @@ -46,14 +47,14 @@ ; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen ; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen -; Scratch size = alloca size + emergency stack slot +; Scratch size = alloca size + emergency 
stack slot ; ALL: ; ScratchSize: 32772 define amdgpu_kernel void @large_alloca_compute_shader(i32 %x, i32 %y) #0 { - %large = alloca [8192 x i32], align 4 - %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 - store volatile i32 %x, i32* %gep - %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y - %val = load volatile i32, i32* %gep1 + %large = alloca [8192 x i32], align 4, addrspace(5) + %gep = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %large, i32 0, i32 8191 + store volatile i32 %x, i32 addrspace(5)* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %large, i32 0, i32 %y + %val = load volatile i32, i32 addrspace(5)* %gep1 store volatile i32 %val, i32 addrspace(1)* undef ret void } Index: test/CodeGen/AMDGPU/large-alloca-graphics.ll =================================================================== --- test/CodeGen/AMDGPU/large-alloca-graphics.ll +++ test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -1,6 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s ; RUN: llc -march=amdgcn -mcpu=carrizo -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=ALL %s +target datalayout = "A5" ; ALL-LABEL: {{^}}large_alloca_pixel_shader: ; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 @@ -15,11 +16,11 @@ ; ALL: ; ScratchSize: 32772 define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 { - %large = alloca [8192 x i32], align 4 - %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 - store volatile i32 %x, i32* %gep - %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y - %val = load volatile i32, i32* %gep1 + %large = alloca [8192 x i32], align 4, addrspace(5) + %gep = getelementptr [8192 x
i32], [8192 x i32] addrspace(5)* %large, i32 0, i32 8191 + store volatile i32 %x, i32 addrspace(5)* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %large, i32 0, i32 %y + %val = load volatile i32, i32 addrspace(5)* %gep1 store volatile i32 %val, i32 addrspace(1)* undef ret void } @@ -37,11 +38,11 @@ ; ALL: ; ScratchSize: 32772 define amdgpu_ps void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #0 { - %large = alloca [8192 x i32], align 4 - %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 - store volatile i32 %x, i32* %gep - %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y - %val = load volatile i32, i32* %gep1 + %large = alloca [8192 x i32], align 4, addrspace(5) + %gep = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %large, i32 0, i32 8191 + store volatile i32 %x, i32 addrspace(5)* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %large, i32 0, i32 %y + %val = load volatile i32, i32 addrspace(5)* %gep1 store volatile i32 %val, i32 addrspace(1)* undef ret void } Index: test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll @@ -4,11 +4,11 @@ declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #2 declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2 -declare i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* nocapture, i32, i32, i32, i1) #2 +declare i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* nocapture, i32, i32, i32, i1) #2 declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #2 declare i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* nocapture, i64, i32, i32, i1) #2 -declare i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* nocapture, i64, i32, i32, i1) #2 
+declare i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* nocapture, i64, i32, i32, i1) #2 declare i32 @llvm.amdgcn.workitem.id.x() #1 @@ -159,9 +159,9 @@ ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define amdgpu_kernel void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42, i32 0, i32 0, i1 false) - store i32 %result, i32 addrspace(4)* %out +define amdgpu_kernel void @flat_atomic_dec_ret_i32(i32* %out, i32* %ptr) #0 { + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false) + store i32 %result, i32* %out ret void } @@ -169,18 +169,18 @@ ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CIVI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} ; GFX9: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:16 glc{{$}} -define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false) - store i32 %result, i32 addrspace(4)* %out +define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(i32* %out, i32* %ptr) #0 { + %gep = getelementptr i32, i32* %ptr, i32 4 + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) + store i32 %result, i32* %out ret void } ; GCN-LABEL: {{^}}flat_atomic_dec_noret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define amdgpu_kernel void @flat_atomic_dec_noret_i32(i32 addrspace(4)* %ptr) nounwind { - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42, i32 0, i32 0, i1 false) +define amdgpu_kernel void 
@flat_atomic_dec_noret_i32(i32* %ptr) nounwind { + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false) ret void } @@ -188,9 +188,9 @@ ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CIVI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} ; GFX9: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:16{{$}} -define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false) +define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(i32* %ptr) nounwind { + %gep = getelementptr i32, i32* %ptr, i32 4 + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -198,13 +198,13 @@ ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CIVI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} ; GFX9: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:20 glc{{$}} -define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id - %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 %id - %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5 - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false) - store i32 %result, i32 addrspace(4)* %out.gep + %gep.tid = getelementptr i32, i32* %ptr, i32 %id + %out.gep = getelementptr i32, i32* %out, i32 %id + %gep = getelementptr i32, i32* %gep.tid, i32 5 + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) + store i32 %result, i32* %out.gep ret void } @@ -212,11 +212,11 @@ ; GCN: 
v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CIVI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} ; GFX9: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:20{{$}} -define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id - %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5 - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false) + %gep.tid = getelementptr i32, i32* %ptr, i32 %id + %gep = getelementptr i32, i32* %gep.tid, i32 5 + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -224,9 +224,9 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define amdgpu_kernel void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42, i32 0, i32 0, i1 false) - store i64 %result, i64 addrspace(4)* %out +define amdgpu_kernel void @flat_atomic_dec_ret_i64(i64* %out, i64* %ptr) #0 { + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false) + store i64 %result, i64* %out ret void } @@ -235,10 +235,10 @@ ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} ; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32 glc{{$}} -define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { - %gep = getelementptr i64, i64 
addrspace(4)* %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false) - store i64 %result, i64 addrspace(4)* %out +define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(i64* %out, i64* %ptr) #0 { + %gep = getelementptr i64, i64* %ptr, i32 4 + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) + store i64 %result, i64* %out ret void } @@ -246,8 +246,8 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -define amdgpu_kernel void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind { - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42, i32 0, i32 0, i1 false) +define amdgpu_kernel void @flat_atomic_dec_noret_i64(i64* %ptr) nounwind { + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false) ret void } @@ -256,9 +256,9 @@ ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} ; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}} -define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false) +define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64* %ptr) nounwind { + %gep = getelementptr i64, i64* %ptr, i32 4 + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) ret void } @@ -267,13 +267,13 @@ ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} ; GFX9: flat_atomic_dec_x2 
v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:40 glc{{$}} -define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id - %out.gep = getelementptr i64, i64 addrspace(4)* %out, i32 %id - %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5 - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false) - store i64 %result, i64 addrspace(4)* %out.gep + %gep.tid = getelementptr i64, i64* %ptr, i32 %id + %out.gep = getelementptr i64, i64* %out, i32 %id + %gep = getelementptr i64, i64* %gep.tid, i32 5 + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) + store i64 %result, i64* %out.gep ret void } @@ -282,11 +282,11 @@ ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} ; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:40{{$}} -define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id - %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5 - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false) + %gep.tid = getelementptr i64, i64* %ptr, i32 %id + %gep = getelementptr i64, i64* %gep.tid, i32 5 + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) ret void } Index: test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll 
=================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll @@ -4,11 +4,11 @@ declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #2 declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2 -declare i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* nocapture, i32, i32, i32, i1) #2 +declare i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* nocapture, i32, i32, i32, i1) #2 declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #2 declare i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* nocapture, i64, i32, i32, i1) #2 -declare i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* nocapture, i64, i32, i32, i1) #2 +declare i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* nocapture, i64, i32, i32, i1) #2 declare i32 @llvm.amdgcn.workitem.id.x() #1 @@ -261,9 +261,9 @@ ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define amdgpu_kernel void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42, i32 0, i32 0, i1 false) - store i32 %result, i32 addrspace(4)* %out +define amdgpu_kernel void @flat_atomic_inc_ret_i32(i32* %out, i32* %ptr) #0 { + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false) + store i32 %result, i32* %out ret void } @@ -271,18 +271,18 @@ ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CIVI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} ; GFX9: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:16 glc{{$}} -define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { - %gep = getelementptr 
i32, i32 addrspace(4)* %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false) - store i32 %result, i32 addrspace(4)* %out +define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(i32* %out, i32* %ptr) #0 { + %gep = getelementptr i32, i32* %ptr, i32 4 + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) + store i32 %result, i32* %out ret void } ; GCN-LABEL: {{^}}flat_atomic_inc_noret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define amdgpu_kernel void @flat_atomic_inc_noret_i32(i32 addrspace(4)* %ptr) nounwind { - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42, i32 0, i32 0, i1 false) +define amdgpu_kernel void @flat_atomic_inc_noret_i32(i32* %ptr) nounwind { + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false) ret void } @@ -290,9 +290,9 @@ ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CIVI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} ; GFX9: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:16{{$}} -define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false) +define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(i32* %ptr) nounwind { + %gep = getelementptr i32, i32* %ptr, i32 4 + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -300,13 +300,13 @@ ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CIVI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} ; GFX9: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:20 glc{{$}} -define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 
addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id - %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 %id - %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false) - store i32 %result, i32 addrspace(4)* %out.gep + %gep.tid = getelementptr i32, i32* %ptr, i32 %id + %out.gep = getelementptr i32, i32* %out, i32 %id + %gep = getelementptr i32, i32* %gep.tid, i32 5 + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) + store i32 %result, i32* %out.gep ret void } @@ -314,11 +314,11 @@ ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CIVI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} ; GFX9: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:20{{$}} -define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id - %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false) + %gep.tid = getelementptr i32, i32* %ptr, i32 %id + %gep = getelementptr i32, i32* %gep.tid, i32 5 + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -341,9 +341,9 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define amdgpu_kernel void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { - 
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42, i32 0, i32 0, i1 false) - store i64 %result, i64 addrspace(4)* %out +define amdgpu_kernel void @flat_atomic_inc_ret_i64(i64* %out, i64* %ptr) #0 { + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false) + store i64 %result, i64* %out ret void } @@ -352,10 +352,10 @@ ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} ; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32 glc{{$}} -define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false) - store i64 %result, i64 addrspace(4)* %out +define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64* %out, i64* %ptr) #0 { + %gep = getelementptr i64, i64* %ptr, i32 4 + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) + store i64 %result, i64* %out ret void } @@ -363,8 +363,8 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -define amdgpu_kernel void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind { - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42, i32 0, i32 0, i1 false) +define amdgpu_kernel void @flat_atomic_inc_noret_i64(i64* %ptr) nounwind { + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false) ret void } @@ -373,9 +373,9 @@ ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, 
v{{\[}}[[KLO]]:[[KHI]]{{\]$}} ; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}} -define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false) +define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64* %ptr) nounwind { + %gep = getelementptr i64, i64* %ptr, i32 4 + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) ret void } @@ -384,13 +384,13 @@ ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} ; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:40 glc{{$}} -define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id - %out.gep = getelementptr i64, i64 addrspace(4)* %out, i32 %id - %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false) - store i64 %result, i64 addrspace(4)* %out.gep + %gep.tid = getelementptr i64, i64* %ptr, i32 %id + %out.gep = getelementptr i64, i64* %out, i32 %id + %gep = getelementptr i64, i64* %gep.tid, i32 5 + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) + store i64 %result, i64* %out.gep ret void } @@ -399,11 +399,11 @@ ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} ; GFX9: 
flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:40{{$}} -define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id - %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false) + %gep.tid = getelementptr i64, i64* %ptr, i32 %id + %gep = getelementptr i64, i64* %gep.tid, i32 5 + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) ret void } Index: test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +target datalayout = "A5" ; FIXME: Requires stack object to not assert ; GCN-LABEL: {{^}}test_ps: @@ -8,8 +9,8 @@ ; GCN-NEXT: s_waitcnt ; GCN-NEXT: ; return define amdgpu_ps i32 @test_ps() #1 { - %alloca = alloca i32 - store volatile i32 0, i32* %alloca + %alloca = alloca i32, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() %buffer_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)* %value = load volatile i32, i32 addrspace(2)* %buffer_ptr @@ -21,8 +22,8 @@ ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:4 ; GCN: s_load_dword s0, s[0:1], 0x0 define amdgpu_cs i32 @test_cs() #1 { - %alloca = alloca i32 - store volatile i32 0, i32* %alloca + %alloca = alloca i32, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca 
%implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() %buffer_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)* %value = load volatile i32, i32 addrspace(2)* %buffer_ptr Index: test/CodeGen/AMDGPU/load-hi16.ll =================================================================== --- test/CodeGen/AMDGPU/load-hi16.ll +++ test/CodeGen/AMDGPU/load-hi16.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +target datalayout = "A5" ; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo: ; GCN: s_waitcnt @@ -221,9 +222,9 @@ ; VI: flat_load_ushort v{{[0-9]+}} ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_sdwa -define void @load_flat_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 { +define void @load_flat_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 { entry: - %load = load i16, i16 addrspace(4)* %in + %load = load i16, i16* %in %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef @@ -241,9 +242,9 @@ ; VI: flat_load_ushort v{{[0-9]+}} ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_sdwa -define void @load_flat_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 { +define void @load_flat_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 { entry: - %load = load half, half addrspace(4)* %in + %load = load half, half* %in %build0 = insertelement <2 x half> undef, half %reg, i32 0 %build1 = insertelement <2 x half> %build0, half %load, i32 1 store <2 x half> %build1, <2 x half> addrspace(1)* undef @@ -261,9 +262,9 @@ ; VI: flat_load_ubyte v{{[0-9]+}} ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_sdwa -define void 
@load_flat_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(4)* %in, i16 %reg) #0 { +define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 { entry: - %load = load i8, i8 addrspace(4)* %in + %load = load i8, i8* %in %ext = zext i8 %load to i16 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 @@ -282,9 +283,9 @@ ; VI: flat_load_sbyte v{{[0-9]+}} ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_sdwa -define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(4)* %in, i16 %reg) #0 { +define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 { entry: - %load = load i8, i8 addrspace(4)* %in + %load = load i8, i8* %in %ext = sext i8 %load to i16 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 @@ -301,10 +302,10 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} -define void @load_private_hi_v2i16_reglo_vreg(i16* byval %in, i16 %reg) #0 { +define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 { entry: - %gep = getelementptr inbounds i16, i16* %in, i64 2045 - %load = load i16, i16* %gep + %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045 + %load = load i16, i16 addrspace(5)* %gep %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef @@ -320,10 +321,10 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} -define void @load_private_hi_v2f16_reglo_vreg(half* byval %in, half %reg) #0 { +define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval %in, half %reg) #0 { entry: - %gep = getelementptr inbounds half, half* %in, i64 2045 - %load = load half, half* %gep + %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2045 + %load = 
load half, half addrspace(5)* %gep %build0 = insertelement <2 x half> undef, half %reg, i32 0 %build1 = insertelement <2 x half> %build0, half %load, i32 1 store <2 x half> %build1, <2 x half> addrspace(1)* undef @@ -339,9 +340,9 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} -define void @load_private_hi_v2i16_reglo_vreg_nooff(i16* byval %in, i16 %reg) #0 { +define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval %in, i16 %reg) #0 { entry: - %load = load volatile i16, i16* inttoptr (i32 4094 to i16*) + %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*) %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef @@ -357,9 +358,9 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} -define void @load_private_hi_v2f16_reglo_vreg_nooff(half* %in, half %reg) #0 { +define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 { entry: - %load = load volatile half, half* inttoptr (i32 4094 to half*) + %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*) %build0 = insertelement <2 x half> undef, half %reg, i32 0 %build1 = insertelement <2 x half> %build0, half %load, i32 1 store <2 x half> %build1, <2 x half> addrspace(1)* undef @@ -375,10 +376,10 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} -define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8* byval %in, i16 %reg) #0 { +define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 { entry: - %gep = getelementptr inbounds i8, i8* %in, i64 4091 - %load = load i8, i8* %gep + %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 + %load = load i8, i8 addrspace(5)* %gep %ext = zext i8 %load to i16 %build0 
= insertelement <2 x i16> undef, i16 %reg, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 @@ -395,10 +396,10 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} -define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8* byval %in, i16 %reg) #0 { +define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 { entry: - %gep = getelementptr inbounds i8, i8* %in, i64 4091 - %load = load i8, i8* %gep + %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 + %load = load i8, i8 addrspace(5)* %gep %ext = sext i8 %load to i16 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 @@ -415,9 +416,9 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}} -define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8* %in, i16 %reg) #0 { +define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 { entry: - %load = load volatile i8, i8* inttoptr (i32 4094 to i8*) + %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) %ext = zext i8 %load to i16 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 @@ -434,9 +435,9 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_sbyte v0, off, s[0:3], s4 offset:4094{{$}} -define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8* %in, i16 %reg) #0 { +define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 { entry: - %load = load volatile i8, i8* inttoptr (i32 4094 to i8*) + %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) %ext = sext i8 %load to i16 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 @@ -453,9 +454,9 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_ubyte v0, off, s[0:3], s4 
offset:4094{{$}} -define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8* %in, half %reg) #0 { +define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 { entry: - %load = load volatile i8, i8* inttoptr (i32 4094 to i8*) + %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) %ext = zext i8 %load to i16 %bc.ext = bitcast i16 %ext to half %build0 = insertelement <2 x half> undef, half %reg, i32 0 @@ -510,12 +511,12 @@ ; GFX9-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4094 define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 { entry: - %obj0 = alloca [10 x i32], align 4 - %obj1 = alloca [4096 x i16], align 2 - %bc = bitcast [10 x i32]* %obj0 to i32* - store volatile i32 123, i32* %bc - %gep = getelementptr inbounds [4096 x i16], [4096 x i16]* %obj1, i32 0, i32 2025 - %load = load i16, i16* %gep + %obj0 = alloca [10 x i32], align 4, addrspace(5) + %obj1 = alloca [4096 x i16], align 2, addrspace(5) + %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* + store volatile i32 123, i32 addrspace(5)* %bc + %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025 + %load = load i16, i16 addrspace(5)* %gep %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef @@ -527,12 +528,12 @@ ; GFX9-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095 define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 { entry: - %obj0 = alloca [10 x i32], align 4 - %obj1 = alloca [4096 x i8], align 2 - %bc = bitcast [10 x i32]* %obj0 to i32* - store volatile i32 123, i32* %bc - %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051 - %load = load i8, i8* %gep + %obj0 = alloca [10 x i32], align 4, addrspace(5) + %obj1 = alloca [4096 x i8], align 2, addrspace(5) 
+ %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* + store volatile i32 123, i32 addrspace(5)* %bc + %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051 + %load = load i8, i8 addrspace(5)* %gep %ext = sext i8 %load to i16 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 @@ -545,12 +546,12 @@ ; GFX9-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095 define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 { entry: - %obj0 = alloca [10 x i32], align 4 - %obj1 = alloca [4096 x i8], align 2 - %bc = bitcast [10 x i32]* %obj0 to i32* - store volatile i32 123, i32* %bc - %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051 - %load = load i8, i8* %gep + %obj0 = alloca [10 x i32], align 4, addrspace(5) + %obj1 = alloca [4096 x i8], align 2, addrspace(5) + %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* + store volatile i32 123, i32 addrspace(5)* %bc + %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051 + %load = load i8, i8 addrspace(5)* %gep %ext = zext i8 %load to i16 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 @@ -606,11 +607,11 @@ ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 -define <2 x i16> @load_flat_v2i16_split(i16 addrspace(4)* %in) #0 { +define <2 x i16> @load_flat_v2i16_split(i16* %in) #0 { entry: - %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1 - %load0 = load volatile i16, i16 addrspace(4)* %in - %load1 = load volatile i16, i16 addrspace(4)* %gep + %gep = getelementptr inbounds i16, i16* %in, i64 1 + %load0 = load volatile i16, i16* %in + %load1 = load volatile i16, i16* %gep %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1 
ret <2 x i16> %build1 @@ -644,11 +645,11 @@ ; GFX9-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:6 ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 -define <2 x i16> @load_private_v2i16_split(i16* byval %in) #0 { +define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 { entry: - %gep = getelementptr inbounds i16, i16* %in, i32 1 - %load0 = load volatile i16, i16* %in - %load1 = load volatile i16, i16* %gep + %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i32 1 + %load0 = load volatile i16, i16 addrspace(5)* %in + %load1 = load volatile i16, i16 addrspace(5)* %gep %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1 ret <2 x i16> %build1 Index: test/CodeGen/AMDGPU/load-lo16.ll =================================================================== --- test/CodeGen/AMDGPU/load-lo16.ll +++ test/CodeGen/AMDGPU/load-lo16.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +target datalayout = "A5" ; GCN-LABEL: {{^}}load_local_lo_v2i16_undeflo: ; GCN: s_waitcnt @@ -269,10 +270,10 @@ ; VI: flat_load_ushort v{{[0-9]+}} ; VI: v_or_b32_e32 -define void @load_flat_lo_v2i16_reghi_vreg(i16 addrspace(4)* %in, i32 %reg) #0 { +define void @load_flat_lo_v2i16_reghi_vreg(i16* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> - %load = load i16, i16 addrspace(4)* %in + %load = load i16, i16* %in %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef ret void @@ -288,10 +289,10 @@ ; VI: flat_load_ushort v{{[0-9]+}} ; VI: v_or_b32_e32 -define void @load_flat_lo_v2f16_reghi_vreg(half addrspace(4)* %in, i32 %reg) #0 { +define void 
@load_flat_lo_v2f16_reghi_vreg(half* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x half> - %load = load half, half addrspace(4)* %in + %load = load half, half* %in %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0 store <2 x half> %build1, <2 x half> addrspace(1)* undef ret void @@ -307,10 +308,10 @@ ; VI: flat_load_ubyte v{{[0-9]+}} ; VI: v_or_b32_e32 -define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(4)* %in, i32 %reg) #0 { +define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> - %load = load i8, i8 addrspace(4)* %in + %load = load i8, i8* %in %ext = zext i8 %load to i16 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef @@ -328,10 +329,10 @@ ; VI: flat_load_sbyte v{{[0-9]+}} ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -define void @load_flat_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(4)* %in, i32 %reg) #0 { +define void @load_flat_lo_v2i16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> - %load = load i8, i8 addrspace(4)* %in + %load = load i8, i8* %in %ext = sext i8 %load to i16 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef @@ -347,11 +348,11 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} -define void @load_private_lo_v2i16_reglo_vreg(i16* byval %in, i32 %reg) #0 { +define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> - %gep = getelementptr inbounds i16, i16* %in, i64 2045 - %load = load i16, i16* %gep + %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045 + %load = load i16, i16 addrspace(5)* %gep %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0 store 
<2 x i16> %build1, <2 x i16> addrspace(1)* undef ret void @@ -369,10 +370,10 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} -define void @load_private_lo_v2i16_reghi_vreg(i16* byval %in, i16 %reg) #0 { +define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 { entry: - %gep = getelementptr inbounds i16, i16* %in, i64 2045 - %load = load i16, i16* %gep + %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045 + %load = load i16, i16 addrspace(5)* %gep %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef @@ -388,11 +389,11 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} -define void @load_private_lo_v2f16_reglo_vreg(half* byval %in, i32 %reg) #0 { +define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x half> - %gep = getelementptr inbounds half, half* %in, i64 2045 - %load = load half, half* %gep + %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2045 + %load = load half, half addrspace(5)* %gep %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0 store <2 x half> %build1, <2 x half> addrspace(1)* undef ret void @@ -407,10 +408,10 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} -define void @load_private_lo_v2i16_reglo_vreg_nooff(i16* %in, i32 %reg) #0 { +define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> - %load = load volatile i16, i16* inttoptr (i32 4094 to i16*) + %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*) %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef ret 
void @@ -425,10 +426,10 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} -define void @load_private_lo_v2i16_reghi_vreg_nooff(i16* %in, i32 %reg) #0 { +define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> - %load = load volatile i16, i16* inttoptr (i32 4094 to i16*) + %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*) %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef ret void @@ -443,10 +444,10 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} -define void @load_private_lo_v2f16_reglo_vreg_nooff(half* %in, i32 %reg) #0 { +define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x half> - %load = load volatile half, half* inttoptr (i32 4094 to half*) + %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*) %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0 store <2 x half> %build1, <2 x half> addrspace(1)* undef ret void @@ -461,11 +462,11 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} -define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8* byval %in, i32 %reg) #0 { +define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> - %gep = getelementptr inbounds i8, i8* %in, i64 4091 - %load = load i8, i8* %gep + %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 + %load = load i8, i8 addrspace(5)* %gep %ext = zext i8 %load to i16 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef @@ -481,11 +482,11 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_sbyte v{{[0-9]+}}, off, 
s[0:3], s5 offset:4095{{$}} -define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8* byval %in, i32 %reg) #0 { +define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> - %gep = getelementptr inbounds i8, i8* %in, i64 4091 - %load = load i8, i8* %gep + %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 + %load = load i8, i8 addrspace(5)* %gep %ext = sext i8 %load to i16 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef @@ -501,10 +502,10 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}} -define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8* %in, i32 %reg) #0 { +define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> - %load = load volatile i8, i8* inttoptr (i32 4094 to i8*) + %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) %ext = zext i8 %load to i16 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef @@ -520,10 +521,10 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_sbyte v0, off, s[0:3], s4 offset:4094{{$}} -define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8* %in, i32 %reg) #0 { +define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> - %load = load volatile i8, i8* inttoptr (i32 4094 to i8*) + %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) %ext = sext i8 %load to i16 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef @@ -539,10 +540,10 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}} -define void 
@load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8* %in, i32 %reg) #0 { +define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x half> - %load = load volatile i8, i8* inttoptr (i32 4094 to i8*) + %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) %ext = zext i8 %load to i16 %bc.ext = bitcast i16 %ext to half %build1 = insertelement <2 x half> %reg.bc, half %bc.ext, i32 0 @@ -595,13 +596,13 @@ ; VI: buffer_load_ushort v define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 { entry: - %obj0 = alloca [10 x i32], align 4 - %obj1 = alloca [4096 x i16], align 2 + %obj0 = alloca [10 x i32], align 4, addrspace(5) + %obj1 = alloca [4096 x i16], align 2, addrspace(5) %reg.bc = bitcast i32 %reg to <2 x i16> - %bc = bitcast [10 x i32]* %obj0 to i32* - store volatile i32 123, i32* %bc - %gep = getelementptr inbounds [4096 x i16], [4096 x i16]* %obj1, i32 0, i32 2025 - %load = load volatile i16, i16* %gep + %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* + store volatile i32 123, i32 addrspace(5)* %bc + %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025 + %load = load volatile i16, i16 addrspace(5)* %gep %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef ret void @@ -614,13 +615,13 @@ ; VI: buffer_load_sbyte v define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 { entry: - %obj0 = alloca [10 x i32], align 4 - %obj1 = alloca [4096 x i8], align 2 + %obj0 = alloca [10 x i32], align 4, addrspace(5) + %obj1 = alloca [4096 x i8], align 2, addrspace(5) %reg.bc = bitcast i32 %reg to <2 x i16> - %bc = bitcast [10 x i32]* %obj0 to i32* - store volatile i32 123, i32* %bc - %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051 - %load = load volatile i8, i8* %gep + %bc = bitcast [10 x i32] 
addrspace(5)* %obj0 to i32 addrspace(5)* + store volatile i32 123, i32 addrspace(5)* %bc + %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051 + %load = load volatile i8, i8 addrspace(5)* %gep %load.ext = sext i8 %load to i16 %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef @@ -634,13 +635,13 @@ ; VI: buffer_load_ubyte v define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { entry: - %obj0 = alloca [10 x i32], align 4 - %obj1 = alloca [4096 x i8], align 2 + %obj0 = alloca [10 x i32], align 4, addrspace(5) + %obj1 = alloca [4096 x i8], align 2, addrspace(5) %reg.bc = bitcast i32 %reg to <2 x i16> - %bc = bitcast [10 x i32]* %obj0 to i32* - store volatile i32 123, i32* %bc - %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051 - %load = load volatile i8, i8* %gep + %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* + store volatile i32 123, i32 addrspace(5)* %bc + %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051 + %load = load volatile i8, i8 addrspace(5)* %gep %load.ext = zext i8 %load to i16 %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef Index: test/CodeGen/AMDGPU/local-stack-slot-offset.ll =================================================================== --- test/CodeGen/AMDGPU/local-stack-slot-offset.ll +++ test/CodeGen/AMDGPU/local-stack-slot-offset.ll @@ -1,5 +1,6 @@ ;RUN: llc < %s -march=amdgcn -mcpu=verde -mattr=+vgpr-spilling -mattr=-promote-alloca -verify-machineinstrs | FileCheck %s -check-prefix=CHECK ;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+vgpr-spilling -mattr=-promote-alloca -verify-machineinstrs | FileCheck %s -check-prefix=CHECK +target datalayout = "A5" ; Allocate two stack slots of 2052 bytes each requiring a total of 4104 bytes. 
; Extracting the last element of each does not fit into the offset field of @@ -13,22 +14,22 @@ ; CHECK: buffer_load_dword define amdgpu_gs float @main(float %v1, float %v2, i32 %idx1, i32 %idx2) { main_body: - %m1 = alloca [513 x float] - %m2 = alloca [513 x float] + %m1 = alloca [513 x float], addrspace(5) + %m2 = alloca [513 x float], addrspace(5) - %gep1.store = getelementptr [513 x float], [513 x float]* %m1, i32 0, i32 %idx1 - store float %v1, float* %gep1.store + %gep1.store = getelementptr [513 x float], [513 x float] addrspace(5)* %m1, i32 0, i32 %idx1 + store float %v1, float addrspace(5)* %gep1.store - %gep2.store = getelementptr [513 x float], [513 x float]* %m2, i32 0, i32 %idx2 - store float %v2, float* %gep2.store + %gep2.store = getelementptr [513 x float], [513 x float] addrspace(5)* %m2, i32 0, i32 %idx2 + store float %v2, float addrspace(5)* %gep2.store ; This used to use a base reg equal to 0. - %gep1.load = getelementptr [513 x float], [513 x float]* %m1, i32 0, i32 0 - %out1 = load float, float* %gep1.load + %gep1.load = getelementptr [513 x float], [513 x float] addrspace(5)* %m1, i32 0, i32 0 + %out1 = load float, float addrspace(5)* %gep1.load ; This used to attempt to re-use the base reg at 0, generating an out-of-bounds instruction offset. 
- %gep2.load = getelementptr [513 x float], [513 x float]* %m2, i32 0, i32 512 - %out2 = load float, float* %gep2.load + %gep2.load = getelementptr [513 x float], [513 x float] addrspace(5)* %m2, i32 0, i32 512 + %out2 = load float, float addrspace(5)* %gep2.load %r = fadd float %out1, %out2 ret float %r Index: test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll +++ test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll @@ -7,10 +7,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @system_monotonic_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in monotonic monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic monotonic ret void } @@ -20,10 +20,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_acquire_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in acquire monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire monotonic ret void } @@ -33,10 +33,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @system_release_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in release monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = 
cmpxchg volatile i32* %gep, i32 %old, i32 %in release monotonic ret void } @@ -46,10 +46,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_acq_rel_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in acq_rel monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel monotonic ret void } @@ -59,10 +59,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_seq_cst_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst monotonic ret void } @@ -72,10 +72,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_acquire_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in acquire acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire acquire ret void } @@ -85,10 +85,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_release_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in release acquire + %gep = getelementptr i32, i32* %out, i32 
4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release acquire ret void } @@ -98,10 +98,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_acq_rel_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in acq_rel acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel acquire ret void } @@ -111,10 +111,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_seq_cst_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst acquire ret void } @@ -124,10 +124,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_seq_cst_seq_cst( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst ret void } @@ -137,10 +137,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_monotonic_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic + 
%gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic ret void } @@ -150,10 +150,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_acquire_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic ret void } @@ -163,10 +163,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_release_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic ret void } @@ -176,10 +176,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_acq_rel_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic ret void } @@ -189,10 +189,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_seq_cst_monotonic( - i32 
addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic ret void } @@ -202,10 +202,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_acquire_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire ret void } @@ -215,10 +215,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_release_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire ret void } @@ -228,10 +228,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_acq_rel_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile 
i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire ret void } @@ -241,10 +241,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_seq_cst_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire ret void } @@ -254,10 +254,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_seq_cst_seq_cst( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst ret void } @@ -267,10 +267,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_monotonic_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic ret void } @@ -280,10 +280,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_acquire_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr 
i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic ret void } @@ -293,10 +293,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_release_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") release monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release monotonic ret void } @@ -306,10 +306,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_acq_rel_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic ret void } @@ -319,10 +319,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_seq_cst_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic ret void } @@ -332,10 +332,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: 
buffer_wbinvl1_vol define amdgpu_kernel void @agent_acquire_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire ret void } @@ -345,10 +345,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_release_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") release acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release acquire ret void } @@ -358,10 +358,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_acq_rel_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire ret void } @@ -371,10 +371,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_seq_cst_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg 
volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire ret void } @@ -384,10 +384,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_seq_cst_seq_cst( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst ret void } @@ -397,10 +397,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_monotonic_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic ret void } @@ -410,10 +410,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_acquire_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic ret void } @@ -423,10 +423,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_release_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, 
i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic ret void } @@ -436,10 +436,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_acq_rel_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic ret void } @@ -449,10 +449,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_seq_cst_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic ret void } @@ -462,10 +462,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_acquire_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire ret void } @@ -475,10 +475,10 @@ ; CHECK-NOT: s_waitcnt 
vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_release_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire ret void } @@ -488,10 +488,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_acq_rel_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire ret void } @@ -501,10 +501,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_seq_cst_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire ret void } @@ -514,10 +514,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_seq_cst_seq_cst( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + 
%gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst ret void } @@ -527,10 +527,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_monotonic_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic ret void } @@ -540,10 +540,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_acquire_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic ret void } @@ -553,10 +553,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_release_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic ret void } @@ -566,10 +566,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_acq_rel_monotonic( - i32 addrspace(4)* %out, i32 %in, 
i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic ret void } @@ -579,10 +579,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_seq_cst_monotonic( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic ret void } @@ -592,10 +592,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_acquire_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire ret void } @@ -605,10 +605,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_release_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") 
release acquire ret void } @@ -618,10 +618,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_acq_rel_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire ret void } @@ -631,10 +631,10 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_seq_cst_acquire( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire ret void } @@ -644,9 +644,9 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_seq_cst_seq_cst( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst ret void } Index: test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll +++ test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll @@ -7,9 +7,9 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define 
amdgpu_kernel void @system_monotonic( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in monotonic + %val = atomicrmw volatile xchg i32* %out, i32 %in monotonic ret void } @@ -19,9 +19,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_acquire( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in acquire + %val = atomicrmw volatile xchg i32* %out, i32 %in acquire ret void } @@ -31,9 +31,9 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @system_release( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in release + %val = atomicrmw volatile xchg i32* %out, i32 %in release ret void } @@ -43,9 +43,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_acq_rel( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in acq_rel + %val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel ret void } @@ -55,9 +55,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_seq_cst( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst + %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst ret void } @@ -67,9 +67,9 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_monotonic( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("singlethread") monotonic + %val = atomicrmw volatile xchg i32* %out, i32 %in 
syncscope("singlethread") monotonic ret void } @@ -79,9 +79,9 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_acquire( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("singlethread") acquire + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire ret void } @@ -91,9 +91,9 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_release( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("singlethread") release + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") release ret void } @@ -103,9 +103,9 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_acq_rel( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("singlethread") acq_rel + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel ret void } @@ -115,9 +115,9 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_seq_cst( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("singlethread") seq_cst + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst ret void } @@ -127,9 +127,9 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_monotonic( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("agent") monotonic + %val = atomicrmw volatile xchg i32* %out, i32 %in 
syncscope("agent") monotonic ret void } @@ -139,9 +139,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_acquire( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("agent") acquire + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire ret void } @@ -151,9 +151,9 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_release( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("agent") release + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") release ret void } @@ -163,9 +163,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_acq_rel( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("agent") acq_rel + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel ret void } @@ -175,9 +175,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_seq_cst( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst ret void } @@ -187,9 +187,9 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_monotonic( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("workgroup") monotonic + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") monotonic ret void } @@ -199,9 +199,9 @@ ; 
CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_acquire( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("workgroup") acquire + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acquire ret void } @@ -211,9 +211,9 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_release( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("workgroup") release + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") release ret void } @@ -223,9 +223,9 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_acq_rel( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("workgroup") acq_rel + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acq_rel ret void } @@ -235,9 +235,9 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_seq_cst( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") seq_cst ret void } @@ -247,9 +247,9 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_monotonic( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("wavefront") monotonic + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") monotonic ret void } @@ -259,9 +259,9 @@ ; CHECK-NOT: s_waitcnt 
vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_acquire( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("wavefront") acquire + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire ret void } @@ -271,9 +271,9 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_release( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("wavefront") release + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") release ret void } @@ -283,9 +283,9 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_acq_rel( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("wavefront") acq_rel + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel ret void } @@ -295,8 +295,8 @@ ; CHECK-NOT: s_waitcnt vmcnt(0){{$}} ; CHECK-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_seq_cst( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("wavefront") seq_cst + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst ret void } Index: test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll +++ test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll @@ -8,36 +8,36 @@ ret void } -; CHECK: error: :0:0: in function invalid_load void (i32 addrspace(4)*, i32 addrspace(4)*): Unsupported synchronization scope +; CHECK: error: :0:0: in function invalid_load void 
(i32*, i32*): Unsupported synchronization scope define amdgpu_kernel void @invalid_load( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in syncscope("invalid") seq_cst, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in syncscope("invalid") seq_cst, align 4 + store i32 %val, i32* %out ret void } -; CHECK: error: :0:0: in function invalid_store void (i32, i32 addrspace(4)*): Unsupported synchronization scope +; CHECK: error: :0:0: in function invalid_store void (i32, i32*): Unsupported synchronization scope define amdgpu_kernel void @invalid_store( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out syncscope("invalid") seq_cst, align 4 + store atomic i32 %in, i32* %out syncscope("invalid") seq_cst, align 4 ret void } -; CHECK: error: :0:0: in function invalid_cmpxchg void (i32 addrspace(4)*, i32, i32): Unsupported synchronization scope +; CHECK: error: :0:0: in function invalid_cmpxchg void (i32*, i32, i32): Unsupported synchronization scope define amdgpu_kernel void @invalid_cmpxchg( - i32 addrspace(4)* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("invalid") seq_cst seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("invalid") seq_cst seq_cst ret void } -; CHECK: error: :0:0: in function invalid_rmw void (i32 addrspace(4)*, i32): Unsupported synchronization scope +; CHECK: error: :0:0: in function invalid_rmw void (i32*, i32): Unsupported synchronization scope define amdgpu_kernel void @invalid_rmw( - i32 addrspace(4)* %out, i32 %in) { + i32* %out, i32 %in) { entry: - %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("invalid") seq_cst + %val = 
atomicrmw volatile xchg i32* %out, i32 %in syncscope("invalid") seq_cst ret void } Index: test/CodeGen/AMDGPU/memory-legalizer-load.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-load.ll +++ test/CodeGen/AMDGPU/memory-legalizer-load.ll @@ -12,10 +12,10 @@ ; GCN-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @system_unordered( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in unordered, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in unordered, align 4 + store i32 %val, i32* %out ret void } @@ -26,10 +26,10 @@ ; GCN-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @system_monotonic( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in monotonic, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in monotonic, align 4 + store i32 %val, i32* %out ret void } @@ -40,10 +40,10 @@ ; GCN-NEXT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @system_acquire( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in acquire, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in acquire, align 4 + store i32 %val, i32* %out ret void } @@ -54,10 +54,10 @@ ; GCN-NEXT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @system_seq_cst( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in seq_cst, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in seq_cst, align 4 + store 
i32 %val, i32* %out ret void } @@ -68,10 +68,10 @@ ; GCN-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @singlethread_unordered( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in syncscope("singlethread") unordered, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4 + store i32 %val, i32* %out ret void } @@ -82,10 +82,10 @@ ; GCN-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @singlethread_monotonic( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in syncscope("singlethread") monotonic, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4 + store i32 %val, i32* %out ret void } @@ -96,10 +96,10 @@ ; GCN-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @singlethread_acquire( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in syncscope("singlethread") acquire, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4 + store i32 %val, i32* %out ret void } @@ -110,10 +110,10 @@ ; GCN-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @singlethread_seq_cst( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in syncscope("singlethread") seq_cst, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4 + store i32 %val, i32* %out ret void } @@ 
-124,10 +124,10 @@ ; GCN-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @agent_unordered( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in syncscope("agent") unordered, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in syncscope("agent") unordered, align 4 + store i32 %val, i32* %out ret void } @@ -138,10 +138,10 @@ ; GCN-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @agent_monotonic( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in syncscope("agent") monotonic, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in syncscope("agent") monotonic, align 4 + store i32 %val, i32* %out ret void } @@ -152,10 +152,10 @@ ; GCN-NEXT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @agent_acquire( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in syncscope("agent") acquire, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in syncscope("agent") acquire, align 4 + store i32 %val, i32* %out ret void } @@ -166,10 +166,10 @@ ; GCN-NEXT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @agent_seq_cst( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in syncscope("agent") seq_cst, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4 + store i32 %val, i32* %out ret void } @@ -180,10 +180,10 @@ ; GCN-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define 
amdgpu_kernel void @workgroup_unordered( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in syncscope("workgroup") unordered, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in syncscope("workgroup") unordered, align 4 + store i32 %val, i32* %out ret void } @@ -194,10 +194,10 @@ ; GCN-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @workgroup_monotonic( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in syncscope("workgroup") monotonic, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in syncscope("workgroup") monotonic, align 4 + store i32 %val, i32* %out ret void } @@ -208,10 +208,10 @@ ; GCN-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @workgroup_acquire( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in syncscope("workgroup") acquire, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in syncscope("workgroup") acquire, align 4 + store i32 %val, i32* %out ret void } @@ -222,10 +222,10 @@ ; GCN-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @workgroup_seq_cst( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in syncscope("workgroup") seq_cst, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in syncscope("workgroup") seq_cst, align 4 + store i32 %val, i32* %out ret void } @@ -236,10 +236,10 @@ ; GCN-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @wavefront_unordered( - i32 addrspace(4)* %in, 
i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in syncscope("wavefront") unordered, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4 + store i32 %val, i32* %out ret void } @@ -250,10 +250,10 @@ ; GCN-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @wavefront_monotonic( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in syncscope("wavefront") monotonic, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4 + store i32 %val, i32* %out ret void } @@ -264,10 +264,10 @@ ; GCN-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @wavefront_acquire( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in syncscope("wavefront") acquire, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4 + store i32 %val, i32* %out ret void } @@ -278,42 +278,42 @@ ; GCN-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @wavefront_seq_cst( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load atomic i32, i32 addrspace(4)* %in syncscope("wavefront") seq_cst, align 4 - store i32 %val, i32 addrspace(4)* %out + %val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4 + store i32 %val, i32* %out ret void } ; GCN-LABEL: {{^}}nontemporal_private_0 ; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}} define amdgpu_kernel void @nontemporal_private_0( - i32* %in, i32 addrspace(4)* %out) { + i32 
addrspace(5)* %in, i32* %out) { entry: - %val = load i32, i32* %in, align 4, !nontemporal !0 - store i32 %val, i32 addrspace(4)* %out + %val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0 + store i32 %val, i32* %out ret void } ; GCN-LABEL: {{^}}nontemporal_private_1 ; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}} define amdgpu_kernel void @nontemporal_private_1( - i32* %in, i32 addrspace(4)* %out) { + i32 addrspace(5)* %in, i32* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() - %val.gep = getelementptr inbounds i32, i32* %in, i32 %tid - %val = load i32, i32* %val.gep, align 4, !nontemporal !0 - store i32 %val, i32 addrspace(4)* %out + %val.gep = getelementptr inbounds i32, i32 addrspace(5)* %in, i32 %tid + %val = load i32, i32 addrspace(5)* %val.gep, align 4, !nontemporal !0 + store i32 %val, i32* %out ret void } ; GCN-LABEL: {{^}}nontemporal_global_0 ; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0x0{{$}} define amdgpu_kernel void @nontemporal_global_0( - i32 addrspace(1)* %in, i32 addrspace(4)* %out) { + i32 addrspace(1)* %in, i32* %out) { entry: %val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0 - store i32 %val, i32 addrspace(4)* %out + store i32 %val, i32* %out ret void } @@ -321,56 +321,56 @@ ; GFX8: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} ; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off glc slc{{$}} define amdgpu_kernel void @nontemporal_global_1( - i32 addrspace(1)* %in, i32 addrspace(4)* %out) { + i32 addrspace(1)* %in, i32* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid %val = load i32, i32 addrspace(1)* %val.gep, align 4, !nontemporal !0 - store i32 %val, i32 addrspace(4)* %out + store i32 %val, i32* %out ret void } ; GCN-LABEL: {{^}}nontemporal_local_0 ; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}} define 
amdgpu_kernel void @nontemporal_local_0( - i32 addrspace(3)* %in, i32 addrspace(4)* %out) { + i32 addrspace(3)* %in, i32* %out) { entry: %val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0 - store i32 %val, i32 addrspace(4)* %out + store i32 %val, i32* %out ret void } ; GCN-LABEL: {{^}}nontemporal_local_1 ; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}} define amdgpu_kernel void @nontemporal_local_1( - i32 addrspace(3)* %in, i32 addrspace(4)* %out) { + i32 addrspace(3)* %in, i32* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid %val = load i32, i32 addrspace(3)* %val.gep, align 4, !nontemporal !0 - store i32 %val, i32 addrspace(4)* %out + store i32 %val, i32* %out ret void } ; GCN-LABEL: {{^}}nontemporal_flat_0 ; GCN: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} define amdgpu_kernel void @nontemporal_flat_0( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load i32, i32 addrspace(4)* %in, align 4, !nontemporal !0 - store i32 %val, i32 addrspace(4)* %out + %val = load i32, i32* %in, align 4, !nontemporal !0 + store i32 %val, i32* %out ret void } ; GCN-LABEL: {{^}}nontemporal_flat_1 ; GCN: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} define amdgpu_kernel void @nontemporal_flat_1( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() - %val.gep = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tid - %val = load i32, i32 addrspace(4)* %val.gep, align 4, !nontemporal !0 - store i32 %val, i32 addrspace(4)* %out + %val.gep = getelementptr inbounds i32, i32* %in, i32 %tid + %val = load i32, i32* %val.gep, align 4, !nontemporal !0 + store i32 %val, i32* %out ret void } Index: test/CodeGen/AMDGPU/memory-legalizer-store.ll =================================================================== --- 
test/CodeGen/AMDGPU/memory-legalizer-store.ll +++ test/CodeGen/AMDGPU/memory-legalizer-store.ll @@ -9,9 +9,9 @@ ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @system_unordered( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out unordered, align 4 + store atomic i32 %in, i32* %out unordered, align 4 ret void } @@ -19,9 +19,9 @@ ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @system_monotonic( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out monotonic, align 4 + store atomic i32 %in, i32* %out monotonic, align 4 ret void } @@ -29,9 +29,9 @@ ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @system_release( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out release, align 4 + store atomic i32 %in, i32* %out release, align 4 ret void } @@ -39,9 +39,9 @@ ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @system_seq_cst( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out seq_cst, align 4 + store atomic i32 %in, i32* %out seq_cst, align 4 ret void } @@ -49,9 +49,9 @@ ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @singlethread_unordered( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out syncscope("singlethread") unordered, align 4 + store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4 ret void } @@ -59,9 +59,9 @@ ; GCN-NOT: s_waitcnt 
vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @singlethread_monotonic( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out syncscope("singlethread") monotonic, align 4 + store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4 ret void } @@ -69,9 +69,9 @@ ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @singlethread_release( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out syncscope("singlethread") release, align 4 + store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4 ret void } @@ -79,9 +79,9 @@ ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @singlethread_seq_cst( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out syncscope("singlethread") seq_cst, align 4 + store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4 ret void } @@ -89,9 +89,9 @@ ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @agent_unordered( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out syncscope("agent") unordered, align 4 + store atomic i32 %in, i32* %out syncscope("agent") unordered, align 4 ret void } @@ -99,9 +99,9 @@ ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @agent_monotonic( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out syncscope("agent") monotonic, align 4 + store atomic i32 %in, i32* %out syncscope("agent") monotonic, align 
4 ret void } @@ -109,9 +109,9 @@ ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @agent_release( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out syncscope("agent") release, align 4 + store atomic i32 %in, i32* %out syncscope("agent") release, align 4 ret void } @@ -119,9 +119,9 @@ ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @agent_seq_cst( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out syncscope("agent") seq_cst, align 4 + store atomic i32 %in, i32* %out syncscope("agent") seq_cst, align 4 ret void } @@ -129,9 +129,9 @@ ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @workgroup_unordered( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out syncscope("workgroup") unordered, align 4 + store atomic i32 %in, i32* %out syncscope("workgroup") unordered, align 4 ret void } @@ -139,9 +139,9 @@ ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @workgroup_monotonic( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out syncscope("workgroup") monotonic, align 4 + store atomic i32 %in, i32* %out syncscope("workgroup") monotonic, align 4 ret void } @@ -149,9 +149,9 @@ ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @workgroup_release( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out syncscope("workgroup") release, align 4 + store atomic i32 %in, i32* %out 
syncscope("workgroup") release, align 4 ret void } @@ -159,9 +159,9 @@ ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @workgroup_seq_cst( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out syncscope("workgroup") seq_cst, align 4 + store atomic i32 %in, i32* %out syncscope("workgroup") seq_cst, align 4 ret void } @@ -169,9 +169,9 @@ ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @wavefront_unordered( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out syncscope("wavefront") unordered, align 4 + store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4 ret void } @@ -179,9 +179,9 @@ ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @wavefront_monotonic( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out syncscope("wavefront") monotonic, align 4 + store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4 ret void } @@ -189,9 +189,9 @@ ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @wavefront_release( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out syncscope("wavefront") release, align 4 + store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4 ret void } @@ -199,31 +199,31 @@ ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @wavefront_seq_cst( - i32 %in, i32 addrspace(4)* %out) { + i32 %in, i32* %out) { entry: - store atomic i32 %in, i32 addrspace(4)* %out 
syncscope("wavefront") seq_cst, align 4 + store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4 ret void } ; GCN-LABEL: {{^}}nontemporal_private_0 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}} define amdgpu_kernel void @nontemporal_private_0( - i32 addrspace(4)* %in, i32* %out) { + i32* %in, i32 addrspace(5)* %out) { entry: - %val = load i32, i32 addrspace(4)* %in, align 4 - store i32 %val, i32* %out, !nontemporal !0 + %val = load i32, i32* %in, align 4 + store i32 %val, i32 addrspace(5)* %out, !nontemporal !0 ret void } ; GCN-LABEL: {{^}}nontemporal_private_1 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}} define amdgpu_kernel void @nontemporal_private_1( - i32 addrspace(4)* %in, i32* %out) { + i32* %in, i32 addrspace(5)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() - %val = load i32, i32 addrspace(4)* %in, align 4 - %out.gep = getelementptr inbounds i32, i32* %out, i32 %tid - store i32 %val, i32* %out.gep, !nontemporal !0 + %val = load i32, i32* %in, align 4 + %out.gep = getelementptr inbounds i32, i32 addrspace(5)* %out, i32 %tid + store i32 %val, i32 addrspace(5)* %out.gep, !nontemporal !0 ret void } @@ -231,9 +231,9 @@ ; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} ; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}} define amdgpu_kernel void @nontemporal_global_0( - i32 addrspace(4)* %in, i32 addrspace(1)* %out) { + i32* %in, i32 addrspace(1)* %out) { entry: - %val = load i32, i32 addrspace(4)* %in, align 4 + %val = load i32, i32* %in, align 4 store i32 %val, i32 addrspace(1)* %out, !nontemporal !0 ret void } @@ -242,10 +242,10 @@ ; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} ; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}} define amdgpu_kernel void @nontemporal_global_1( - i32 
addrspace(4)* %in, i32 addrspace(1)* %out) { + i32* %in, i32 addrspace(1)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() - %val = load i32, i32 addrspace(4)* %in, align 4 + %val = load i32, i32* %in, align 4 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid store i32 %val, i32 addrspace(1)* %out.gep, !nontemporal !0 ret void @@ -254,9 +254,9 @@ ; GCN-LABEL: {{^}}nontemporal_local_0 ; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}} define amdgpu_kernel void @nontemporal_local_0( - i32 addrspace(4)* %in, i32 addrspace(3)* %out) { + i32* %in, i32 addrspace(3)* %out) { entry: - %val = load i32, i32 addrspace(4)* %in, align 4 + %val = load i32, i32* %in, align 4 store i32 %val, i32 addrspace(3)* %out, !nontemporal !0 ret void } @@ -264,10 +264,10 @@ ; GCN-LABEL: {{^}}nontemporal_local_1 ; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}} define amdgpu_kernel void @nontemporal_local_1( - i32 addrspace(4)* %in, i32 addrspace(3)* %out) { + i32* %in, i32 addrspace(3)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() - %val = load i32, i32 addrspace(4)* %in, align 4 + %val = load i32, i32* %in, align 4 %out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid store i32 %val, i32 addrspace(3)* %out.gep, !nontemporal !0 ret void @@ -276,22 +276,22 @@ ; GCN-LABEL: {{^}}nontemporal_flat_0 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} define amdgpu_kernel void @nontemporal_flat_0( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { entry: - %val = load i32, i32 addrspace(4)* %in, align 4 - store i32 %val, i32 addrspace(4)* %out, !nontemporal !0 + %val = load i32, i32* %in, align 4 + store i32 %val, i32* %out, !nontemporal !0 ret void } ; GCN-LABEL: {{^}}nontemporal_flat_1 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} define amdgpu_kernel void @nontemporal_flat_1( - i32 addrspace(4)* %in, i32 addrspace(4)* %out) { + i32* %in, i32* %out) { 
entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() - %val = load i32, i32 addrspace(4)* %in, align 4 - %out.gep = getelementptr inbounds i32, i32 addrspace(4)* %out, i32 %tid - store i32 %val, i32 addrspace(4)* %out.gep, !nontemporal !0 + %val = load i32, i32* %in, align 4 + %out.gep = getelementptr inbounds i32, i32* %out, i32 %tid + store i32 %val, i32* %out.gep, !nontemporal !0 ret void } Index: test/CodeGen/AMDGPU/move-to-valu-worklist.ll =================================================================== --- test/CodeGen/AMDGPU/move-to-valu-worklist.ll +++ test/CodeGen/AMDGPU/move-to-valu-worklist.ll @@ -13,7 +13,7 @@ ; GCN-NEXT: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @in_worklist_once() #0 { bb: - %tmp = load i64, i64* undef + %tmp = load i64, i64 addrspace(5)* undef br label %bb1 bb1: ; preds = %bb1, %bb Index: test/CodeGen/AMDGPU/mubuf-offset-private.ll =================================================================== --- test/CodeGen/AMDGPU/mubuf-offset-private.ll +++ test/CodeGen/AMDGPU/mubuf-offset-private.ll @@ -1,55 +1,56 @@ ; RUN: llc -march=amdgcn -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI %s ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +target datalayout = "A5" ; Test addressing modes when the scratch base is not a frame index. 
; GCN-LABEL: {{^}}store_private_offset_i8: ; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_i8() #0 { - store volatile i8 5, i8* inttoptr (i32 8 to i8*) + store volatile i8 5, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*) ret void } ; GCN-LABEL: {{^}}store_private_offset_i16: ; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_i16() #0 { - store volatile i16 5, i16* inttoptr (i32 8 to i16*) + store volatile i16 5, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*) ret void } ; GCN-LABEL: {{^}}store_private_offset_i32: ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_i32() #0 { - store volatile i32 5, i32* inttoptr (i32 8 to i32*) + store volatile i32 5, i32 addrspace(5)* inttoptr (i32 8 to i32 addrspace(5)*) ret void } ; GCN-LABEL: {{^}}store_private_offset_v2i32: ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_v2i32() #0 { - store volatile <2 x i32> , <2 x i32>* inttoptr (i32 8 to <2 x i32>*) + store volatile <2 x i32> , <2 x i32> addrspace(5)* inttoptr (i32 8 to <2 x i32> addrspace(5)*) ret void } ; GCN-LABEL: {{^}}store_private_offset_v4i32: ; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_v4i32() #0 { - store volatile <4 x i32> , <4 x i32>* inttoptr (i32 8 to <4 x i32>*) + store volatile <4 x i32> , <4 x i32> addrspace(5)* inttoptr (i32 8 to <4 x i32> addrspace(5)*) ret void } ; GCN-LABEL: {{^}}load_private_offset_i8: ; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_i8() #0 { - %load = load volatile i8, i8* inttoptr (i32 8 to i8*) + %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*) ret void } ; GCN-LABEL: 
{{^}}sextload_private_offset_i8: ; GCN: buffer_load_sbyte v{{[0-9]+}}, off, s[4:7], s8 offset:8 define amdgpu_kernel void @sextload_private_offset_i8(i32 addrspace(1)* %out) #0 { - %load = load volatile i8, i8* inttoptr (i32 8 to i8*) + %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*) %sextload = sext i8 %load to i32 store i32 %sextload, i32 addrspace(1)* undef ret void @@ -58,7 +59,7 @@ ; GCN-LABEL: {{^}}zextload_private_offset_i8: ; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s8 offset:8 define amdgpu_kernel void @zextload_private_offset_i8(i32 addrspace(1)* %out) #0 { - %load = load volatile i8, i8* inttoptr (i32 8 to i8*) + %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*) %zextload = zext i8 %load to i32 store i32 %zextload, i32 addrspace(1)* undef ret void @@ -67,14 +68,14 @@ ; GCN-LABEL: {{^}}load_private_offset_i16: ; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_i16() #0 { - %load = load volatile i16, i16* inttoptr (i32 8 to i16*) + %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*) ret void } ; GCN-LABEL: {{^}}sextload_private_offset_i16: ; GCN: buffer_load_sshort v{{[0-9]+}}, off, s[4:7], s8 offset:8 define amdgpu_kernel void @sextload_private_offset_i16(i32 addrspace(1)* %out) #0 { - %load = load volatile i16, i16* inttoptr (i32 8 to i16*) + %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*) %sextload = sext i16 %load to i32 store i32 %sextload, i32 addrspace(1)* undef ret void @@ -83,7 +84,7 @@ ; GCN-LABEL: {{^}}zextload_private_offset_i16: ; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s8 offset:8 define amdgpu_kernel void @zextload_private_offset_i16(i32 addrspace(1)* %out) #0 { - %load = load volatile i16, i16* inttoptr (i32 8 to i16*) + %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*) %zextload = zext i16 %load to i32 
store i32 %zextload, i32 addrspace(1)* undef ret void @@ -92,28 +93,28 @@ ; GCN-LABEL: {{^}}load_private_offset_i32: ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_i32() #0 { - %load = load volatile i32, i32* inttoptr (i32 8 to i32*) + %load = load volatile i32, i32 addrspace(5)* inttoptr (i32 8 to i32 addrspace(5)*) ret void } ; GCN-LABEL: {{^}}load_private_offset_v2i32: ; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_v2i32() #0 { - %load = load volatile <2 x i32>, <2 x i32>* inttoptr (i32 8 to <2 x i32>*) + %load = load volatile <2 x i32>, <2 x i32> addrspace(5)* inttoptr (i32 8 to <2 x i32> addrspace(5)*) ret void } ; GCN-LABEL: {{^}}load_private_offset_v4i32: ; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_v4i32() #0 { - %load = load volatile <4 x i32>, <4 x i32>* inttoptr (i32 8 to <4 x i32>*) + %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* inttoptr (i32 8 to <4 x i32> addrspace(5)*) ret void } ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset: ; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s2 offset:4095 define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { - store volatile i8 5, i8* inttoptr (i32 4095 to i8*) + store volatile i8 5, i8 addrspace(5)* inttoptr (i32 4095 to i8 addrspace(5)*) ret void } @@ -121,7 +122,7 @@ ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 ; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s2 offen{{$}} define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 { - store volatile i8 5, i8* inttoptr (i32 4096 to i8*) + store volatile i8 5, i8 addrspace(5)* inttoptr (i32 4096 to i8 addrspace(5)*) ret void } @@ -129,7 +130,7 @@ ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 ; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s2 offen offset:1{{$}} define 
amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 { - store volatile i8 5, i8* inttoptr (i32 4097 to i8*) + store volatile i8 5, i8 addrspace(5)* inttoptr (i32 4097 to i8 addrspace(5)*) ret void } @@ -144,11 +145,11 @@ ; GFX9: v_add_u32_e32 [[ADDR:v[0-9]+]], 4, ; GFX9: buffer_store_dword v{{[0-9]+}}, [[ADDR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:32 define amdgpu_kernel void @store_private_unknown_bits_vaddr() #0 { - %alloca = alloca [16 x i32], align 4 + %alloca = alloca [16 x i32], align 4, addrspace(5) %vaddr = load volatile i32, i32 addrspace(1)* undef %vaddr.off = add i32 %vaddr, 8 - %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %vaddr.off - store volatile i32 9, i32* %gep + %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %vaddr.off + store volatile i32 9, i32 addrspace(5)* %gep ret void } Index: test/CodeGen/AMDGPU/nested-calls.ll =================================================================== --- test/CodeGen/AMDGPU/nested-calls.ll +++ test/CodeGen/AMDGPU/nested-calls.ll @@ -1,6 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=VI %s +target datalayout = "A5" ; Test calls when called by other callable functions rather than ; kernels. 
@@ -39,11 +40,11 @@ ; GCN: s_sub_u32 s32, s32, 0x1200{{$}} ; GCN: s_setpc_b64 define void @test_func_call_external_void_func_i32_imm_stack_use() #0 { - %alloca = alloca [16 x i32], align 4 - %gep0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 0 - %gep15 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 16 - store volatile i32 0, i32* %gep0 - store volatile i32 0, i32* %gep15 + %alloca = alloca [16 x i32], align 4, addrspace(5) + %gep0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0 + %gep15 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 16 + store volatile i32 0, i32 addrspace(5)* %gep0 + store volatile i32 0, i32 addrspace(5)* %gep15 call void @external_void_func_i32(i32 42) ret void } Index: test/CodeGen/AMDGPU/parallelandifcollapse.ll =================================================================== --- test/CodeGen/AMDGPU/parallelandifcollapse.ll +++ test/CodeGen/AMDGPU/parallelandifcollapse.ll @@ -1,4 +1,5 @@ ; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck %s +target datalayout = "A5" ; ; CFG flattening should use parallel-and mode to generate branch conditions and ; then merge if-regions with the same bodies. 
@@ -13,44 +14,44 @@ define amdgpu_kernel void @_Z9chk1D_512v() #0 { entry: - %a0 = alloca i32, align 4 - %b0 = alloca i32, align 4 - %c0 = alloca i32, align 4 - %d0 = alloca i32, align 4 - %a1 = alloca i32, align 4 - %b1 = alloca i32, align 4 - %c1 = alloca i32, align 4 - %d1 = alloca i32, align 4 - %data = alloca i32, align 4 - %0 = load i32, i32* %a0, align 4 - %1 = load i32, i32* %b0, align 4 + %a0 = alloca i32, align 4, addrspace(5) + %b0 = alloca i32, align 4, addrspace(5) + %c0 = alloca i32, align 4, addrspace(5) + %d0 = alloca i32, align 4, addrspace(5) + %a1 = alloca i32, align 4, addrspace(5) + %b1 = alloca i32, align 4, addrspace(5) + %c1 = alloca i32, align 4, addrspace(5) + %d1 = alloca i32, align 4, addrspace(5) + %data = alloca i32, align 4, addrspace(5) + %0 = load i32, i32 addrspace(5)* %a0, align 4 + %1 = load i32, i32 addrspace(5)* %b0, align 4 %cmp = icmp ne i32 %0, %1 br i1 %cmp, label %land.lhs.true, label %if.end land.lhs.true: ; preds = %entry - %2 = load i32, i32* %c0, align 4 - %3 = load i32, i32* %d0, align 4 + %2 = load i32, i32 addrspace(5)* %c0, align 4 + %3 = load i32, i32 addrspace(5)* %d0, align 4 %cmp1 = icmp ne i32 %2, %3 br i1 %cmp1, label %if.then, label %if.end if.then: ; preds = %land.lhs.true - store i32 1, i32* %data, align 4 + store i32 1, i32 addrspace(5)* %data, align 4 br label %if.end if.end: ; preds = %if.then, %land.lhs.true, %entry - %4 = load i32, i32* %a1, align 4 - %5 = load i32, i32* %b1, align 4 + %4 = load i32, i32 addrspace(5)* %a1, align 4 + %5 = load i32, i32 addrspace(5)* %b1, align 4 %cmp2 = icmp ne i32 %4, %5 br i1 %cmp2, label %land.lhs.true3, label %if.end6 land.lhs.true3: ; preds = %if.end - %6 = load i32, i32* %c1, align 4 - %7 = load i32, i32* %d1, align 4 + %6 = load i32, i32 addrspace(5)* %c1, align 4 + %7 = load i32, i32 addrspace(5)* %d1, align 4 %cmp4 = icmp ne i32 %6, %7 br i1 %cmp4, label %if.then5, label %if.end6 if.then5: ; preds = %land.lhs.true3 - store i32 1, i32* %data, align 4 + store 
i32 1, i32 addrspace(5)* %data, align 4 br label %if.end6 if.end6: ; preds = %if.then5, %land.lhs.true3, %if.end Index: test/CodeGen/AMDGPU/private-access-no-objects.ll =================================================================== --- test/CodeGen/AMDGPU/private-access-no-objects.ll +++ test/CodeGen/AMDGPU/private-access-no-objects.ll @@ -19,7 +19,7 @@ ; OPTNONE-NOT: s_mov_b32 ; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s5 offen{{$}} define amdgpu_kernel void @store_to_undef() #0 { - store volatile i32 0, i32* undef + store volatile i32 0, i32 addrspace(5)* undef ret void } @@ -29,7 +29,7 @@ ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} ; OPT: buffer_store_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}} define amdgpu_kernel void @store_to_inttoptr() #0 { - store volatile i32 0, i32* inttoptr (i32 124 to i32*) + store volatile i32 0, i32 addrspace(5)* inttoptr (i32 124 to i32 addrspace(5)*) ret void } @@ -39,7 +39,7 @@ ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} ; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} define amdgpu_kernel void @load_from_undef() #0 { - %ld = load volatile i32, i32* undef + %ld = load volatile i32, i32 addrspace(5)* undef ret void } @@ -49,7 +49,7 @@ ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} ; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}} define amdgpu_kernel void @load_from_inttoptr() #0 { - %ld = load volatile i32, i32* inttoptr (i32 124 to i32*) + %ld = load volatile i32, i32 addrspace(5)* inttoptr (i32 124 to i32 addrspace(5)*) ret void } Index: test/CodeGen/AMDGPU/private-element-size.ll =================================================================== --- test/CodeGen/AMDGPU/private-element-size.ll +++ test/CodeGen/AMDGPU/private-element-size.ll @@ -1,6 +1,7 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa 
-mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT16 -check-prefix=HSA -check-prefix=HSA-ELT16 -check-prefix=ALL -check-prefix=HSA_ELTGE8 %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-8 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT8 -check-prefix=HSA -check-prefix=HSA-ELT8 -check-prefix=ALL -check-prefix=HSA-ELTGE8 %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT4 -check-prefix=HSA -check-prefix=HSA-ELT4 -check-prefix=ALL %s +target datalayout = "A5" ; ALL-LABEL: {{^}}private_elt_size_v4i32: @@ -43,13 +44,13 @@ %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom %index.load = load i32, i32 addrspace(1)* %gep.index %index = and i32 %index.load, 2 - %alloca = alloca [2 x <4 x i32>], align 16 - %gep0 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i32 0, i32 0 - %gep1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i32 0, i32 1 - store <4 x i32> zeroinitializer, <4 x i32>* %gep0 - store <4 x i32> , <4 x i32>* %gep1 - %gep2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i32 0, i32 %index - %load = load <4 x i32>, <4 x i32>* %gep2 + %alloca = alloca [2 x <4 x i32>], align 16, addrspace(5) + %gep0 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>] addrspace(5)* %alloca, i32 0, i32 0 + %gep1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>] addrspace(5)* %alloca, i32 0, i32 1 + store <4 x i32> zeroinitializer, <4 x i32> addrspace(5)* %gep0 + store <4 x i32> , <4 x i32> addrspace(5)* %gep1 + %gep2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>] addrspace(5)* %alloca, i32 0, i32 %index + %load = load <4 x i32>, <4 x i32> addrspace(5)* %gep2 store <4 x i32> %load, <4 x i32> addrspace(1)* %out ret void } @@ 
-113,13 +114,13 @@ %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom %index.load = load i32, i32 addrspace(1)* %gep.index %index = and i32 %index.load, 2 - %alloca = alloca [2 x <8 x i32>], align 16 - %gep0 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i32 0, i32 0 - %gep1 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i32 0, i32 1 - store <8 x i32> zeroinitializer, <8 x i32>* %gep0 - store <8 x i32> , <8 x i32>* %gep1 - %gep2 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i32 0, i32 %index - %load = load <8 x i32>, <8 x i32>* %gep2 + %alloca = alloca [2 x <8 x i32>], align 16, addrspace(5) + %gep0 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>] addrspace(5)* %alloca, i32 0, i32 0 + %gep1 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>] addrspace(5)* %alloca, i32 0, i32 1 + store <8 x i32> zeroinitializer, <8 x i32> addrspace(5)* %gep0 + store <8 x i32> , <8 x i32> addrspace(5)* %gep1 + %gep2 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>] addrspace(5)* %alloca, i32 0, i32 %index + %load = load <8 x i32>, <8 x i32> addrspace(5)* %gep2 store <8 x i32> %load, <8 x i32> addrspace(1)* %out ret void } @@ -150,13 +151,13 @@ %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom %index.load = load i32, i32 addrspace(1)* %gep.index %index = and i32 %index.load, 2 - %alloca = alloca [2 x i64], align 16 - %gep0 = getelementptr inbounds [2 x i64], [2 x i64]* %alloca, i32 0, i32 0 - %gep1 = getelementptr inbounds [2 x i64], [2 x i64]* %alloca, i32 0, i32 1 - store i64 0, i64* %gep0 - store i64 34359738602, i64* %gep1 - %gep2 = getelementptr inbounds [2 x i64], [2 x i64]* %alloca, i32 0, i32 %index - %load = load i64, i64* %gep2 + %alloca = alloca [2 x i64], align 16, addrspace(5) + %gep0 = getelementptr inbounds [2 x i64], [2 x i64] addrspace(5)* %alloca, i32 0, i32 0 + %gep1 = getelementptr inbounds [2 x i64], [2 x 
i64] addrspace(5)* %alloca, i32 0, i32 1 + store i64 0, i64 addrspace(5)* %gep0 + store i64 34359738602, i64 addrspace(5)* %gep1 + %gep2 = getelementptr inbounds [2 x i64], [2 x i64] addrspace(5)* %alloca, i32 0, i32 %index + %load = load i64, i64 addrspace(5)* %gep2 store i64 %load, i64 addrspace(1)* %out ret void } @@ -186,13 +187,13 @@ %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom %index.load = load i32, i32 addrspace(1)* %gep.index %index = and i32 %index.load, 2 - %alloca = alloca [2 x double], align 16 - %gep0 = getelementptr inbounds [2 x double], [2 x double]* %alloca, i32 0, i32 0 - %gep1 = getelementptr inbounds [2 x double], [2 x double]* %alloca, i32 0, i32 1 - store double 0.0, double* %gep0 - store double 4.0, double* %gep1 - %gep2 = getelementptr inbounds [2 x double], [2 x double]* %alloca, i32 0, i32 %index - %load = load double, double* %gep2 + %alloca = alloca [2 x double], align 16, addrspace(5) + %gep0 = getelementptr inbounds [2 x double], [2 x double] addrspace(5)* %alloca, i32 0, i32 0 + %gep1 = getelementptr inbounds [2 x double], [2 x double] addrspace(5)* %alloca, i32 0, i32 1 + store double 0.0, double addrspace(5)* %gep0 + store double 4.0, double addrspace(5)* %gep1 + %gep2 = getelementptr inbounds [2 x double], [2 x double] addrspace(5)* %alloca, i32 0, i32 %index + %load = load double, double addrspace(5)* %gep2 store double %load, double addrspace(1)* %out ret void } @@ -235,13 +236,13 @@ %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom %index.load = load i32, i32 addrspace(1)* %gep.index %index = and i32 %index.load, 2 - %alloca = alloca [2 x <2 x i64>], align 16 - %gep0 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* %alloca, i32 0, i32 0 - %gep1 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* %alloca, i32 0, i32 1 - store <2 x i64> zeroinitializer, <2 x i64>* %gep0 - store <2 x i64> , <2 x i64>* %gep1 - %gep2 = getelementptr 
inbounds [2 x <2 x i64>], [2 x <2 x i64>]* %alloca, i32 0, i32 %index - %load = load <2 x i64>, <2 x i64>* %gep2 + %alloca = alloca [2 x <2 x i64>], align 16, addrspace(5) + %gep0 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>] addrspace(5)* %alloca, i32 0, i32 0 + %gep1 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>] addrspace(5)* %alloca, i32 0, i32 1 + store <2 x i64> zeroinitializer, <2 x i64> addrspace(5)* %gep0 + store <2 x i64> , <2 x i64> addrspace(5)* %gep1 + %gep2 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>] addrspace(5)* %alloca, i32 0, i32 %index + %load = load <2 x i64>, <2 x i64> addrspace(5)* %gep2 store <2 x i64> %load, <2 x i64> addrspace(1)* %out ret void } Index: test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll +++ test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll @@ -1,4 +1,5 @@ ; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck %s +target datalayout = "A5" ; FIXME: Error is misleading because it's not an indirect call. @@ -7,20 +8,20 @@ ; Make sure that AMDGPUPromoteAlloca doesn't crash if the called ; function is a constantexpr cast of a function. -declare void @foo(float*) #0 +declare void @foo(float addrspace(5)*) #0 declare void @foo.varargs(...) 
#0 ; XCHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo define amdgpu_kernel void @crash_call_constexpr_cast() #0 { - %alloca = alloca i32 - call void bitcast (void (float*)* @foo to void (i32*)*)(i32* %alloca) #0 + %alloca = alloca i32, addrspace(5) + call void bitcast (void (float addrspace(5)*)* @foo to void (i32 addrspace(5)*)*)(i32 addrspace(5)* %alloca) #0 ret void } ; XCHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo.varargs define amdgpu_kernel void @crash_call_constexpr_cast_varargs() #0 { - %alloca = alloca i32 - call void bitcast (void (...)* @foo.varargs to void (i32*)*)(i32* %alloca) #0 + %alloca = alloca i32, addrspace(5) + call void bitcast (void (...)* @foo.varargs to void (i32 addrspace(5)*)*)(i32 addrspace(5)* %alloca) #0 ret void } Index: test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll +++ test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll @@ -1,5 +1,6 @@ ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s ; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=ASM %s +target datalayout = "A5" ; IR-LABEL: define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 { ; IR: alloca [5 x i32] @@ -8,19 +9,19 @@ ; ASM: ; ScratchSize: 24 define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [5 x i32], align 4, addrspace(5) %tmp0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0 - store i32 4, i32* %arrayidx1, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0 + store i32 4, i32 addrspace(5)* %arrayidx1, 
align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx4 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 - %tmp2 = load i32, i32* %arrayidx4, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx4 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 + %tmp2 = load i32, i32 addrspace(5)* %arrayidx4, align 4 store i32 %tmp2, i32 addrspace(1)* %out, align 4 - %arrayidx5 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 - %tmp3 = load i32, i32* %arrayidx5 + %arrayidx5 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 + %tmp3 = load i32, i32 addrspace(5)* %arrayidx5 %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 store i32 %tmp3, i32 addrspace(1)* %arrayidx6 ret void @@ -35,13 +36,13 @@ ; ASM: ; ScratchSize: 0 define void @promote_to_vector_call_c(i32 addrspace(1)* %out, i32 %in) #0 { entry: - %tmp = alloca [2 x i32] - %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 - %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 - store i32 0, i32* %tmp1 - store i32 1, i32* %tmp2 - %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in - %tmp4 = load i32, i32* %tmp3 + %tmp = alloca [2 x i32], addrspace(5) + %tmp1 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1 + store i32 0, i32 addrspace(5)* %tmp1 + store i32 1, i32 addrspace(5)* %tmp2 + %tmp3 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 %in + %tmp4 = load i32, i32 addrspace(5)* %tmp3 %tmp5 = load volatile i32, i32 addrspace(1)* 
undef %tmp6 = add i32 %tmp4, %tmp5 store i32 %tmp6, i32 addrspace(1)* %out @@ -56,25 +57,25 @@ ; ASM: ; ScratchSize: 24 define void @no_promote_to_lds_c(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [5 x i32], align 4, addrspace(5) %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 - store i32 4, i32* %arrayidx1, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 + store i32 4, i32 addrspace(5)* %arrayidx1, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 - %2 = load i32, i32* %arrayidx10, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 + %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 - %3 = load i32, i32* %arrayidx12 + %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 + %3 = load i32, i32 addrspace(5)* %arrayidx12 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 store i32 %3, i32 addrspace(1)* %arrayidx13 ret void } -declare i32 @foo(i32*) #0 +declare i32 @foo(i32 addrspace(5)*) #0 ; ASM-LABEL: {{^}}call_private: ; ASM: buffer_store_dword @@ -83,13 +84,13 @@ ; ASM: ScratchSize: 16396 define amdgpu_kernel void @call_private(i32 addrspace(1)* %out, i32 %in) #0 { entry: - %tmp 
= alloca [2 x i32] - %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 - %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 - store i32 0, i32* %tmp1 - store i32 1, i32* %tmp2 - %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in - %val = call i32 @foo(i32* %tmp3) + %tmp = alloca [2 x i32], addrspace(5) + %tmp1 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1 + store i32 0, i32 addrspace(5)* %tmp1 + store i32 1, i32 addrspace(5)* %tmp2 + %tmp3 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 %in + %val = call i32 @foo(i32 addrspace(5)* %tmp3) store i32 %val, i32 addrspace(1)* %out ret void } Index: test/CodeGen/AMDGPU/promote-alloca-globals.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-globals.ll +++ test/CodeGen/AMDGPU/promote-alloca-globals.ll @@ -1,5 +1,6 @@ ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=ASM %s +target datalayout = "A5" @global_array0 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4 @@ -12,19 +13,19 @@ define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: - %stack = alloca [10 x i32], align 4 + %stack = alloca [10 x i32], align 4, addrspace(5) %tmp = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [10 x i32], [10 x i32]* %stack, i32 0, i32 %tmp - store i32 4, i32* %arrayidx1, align 4 + %arrayidx1 = getelementptr inbounds [10 x i32], [10 x i32] addrspace(5)* %stack, i32 0, i32 %tmp + store i32 4, i32 addrspace(5)* %arrayidx1, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = 
getelementptr inbounds [10 x i32], [10 x i32]* %stack, i32 0, i32 %tmp1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [10 x i32], [10 x i32]* %stack, i32 0, i32 0 - %tmp2 = load i32, i32* %arrayidx10, align 4 + %arrayidx3 = getelementptr inbounds [10 x i32], [10 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [10 x i32], [10 x i32] addrspace(5)* %stack, i32 0, i32 0 + %tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 store i32 %tmp2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [10 x i32], [10 x i32]* %stack, i32 0, i32 1 - %tmp3 = load i32, i32* %arrayidx12 + %arrayidx12 = getelementptr inbounds [10 x i32], [10 x i32] addrspace(5)* %stack, i32 0, i32 1 + %tmp3 = load i32, i32 addrspace(5)* %arrayidx12 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 store i32 %tmp3, i32 addrspace(1)* %arrayidx13 %v0 = getelementptr inbounds [750 x [10 x i32]], [750 x [10 x i32]] addrspace(3)* @global_array0, i32 0, i32 0, i32 0 Index: test/CodeGen/AMDGPU/promote-alloca-no-opts.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-no-opts.ll +++ test/CodeGen/AMDGPU/promote-alloca-no-opts.ll @@ -1,5 +1,6 @@ ; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=+promote-alloca < %s | FileCheck -check-prefix=NOOPTS -check-prefix=ALL %s ; RUN: llc -O1 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=+promote-alloca < %s | FileCheck -check-prefix=OPTS -check-prefix=ALL %s +target datalayout = "A5" ; ALL-LABEL: {{^}}promote_alloca_i32_array_array: ; NOOPTS: workgroup_group_segment_byte_size = 0{{$}} @@ -7,13 +8,13 @@ ; OPTS: ds_write define amdgpu_kernel void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: - %alloca = alloca [2 x [2 x i32]] - %gep0 = getelementptr inbounds [2 x [2 x 
i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1 - store i32 0, i32* %gep0 - store i32 1, i32* %gep1 - %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index - %load = load i32, i32* %gep2 + %alloca = alloca [2 x [2 x i32]], addrspace(5) + %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0 + %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1 + store i32 0, i32 addrspace(5)* %gep0 + store i32 1, i32 addrspace(5)* %gep1 + %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index + %load = load i32, i32 addrspace(5)* %gep2 store i32 %load, i32 addrspace(1)* %out ret void } @@ -23,13 +24,13 @@ ; ALL-NOT ds_write define amdgpu_kernel void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 { entry: - %alloca = alloca [2 x [2 x i32]] - %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1 - store i32 0, i32* %gep0 - store i32 1, i32* %gep1 - %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index - %load = load i32, i32* %gep2 + %alloca = alloca [2 x [2 x i32]], addrspace(5) + %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0 + %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1 + store i32 0, i32 addrspace(5)* %gep0 + store i32 1, i32 addrspace(5)* %gep1 + %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index + %load = load i32, i32 addrspace(5)* %gep2 store i32 %load, i32 addrspace(1)* %out ret 
void } Index: test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll +++ test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN %s +target datalayout = "A5" ; This shows that the amount of LDS estimate is sensitive to the order ; of the LDS globals. @@ -32,19 +33,19 @@ ; GCN: workgroup_group_segment_byte_size = 2340 define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [5 x i32], align 4, addrspace(5) %tmp0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0 - store i32 4, i32* %arrayidx1, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0 + store i32 4, i32 addrspace(5)* %arrayidx1, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 - %tmp2 = load i32, i32* %arrayidx10, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 + %tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 store i32 %tmp2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 - %tmp3 = load i32, i32* %arrayidx12 + %arrayidx12 = 
getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 + %tmp3 = load i32, i32 addrspace(5)* %arrayidx12 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 store i32 %tmp3, i32 addrspace(1)* %arrayidx13 @@ -64,19 +65,19 @@ ; GCN: workgroup_group_segment_byte_size = 2352 define amdgpu_kernel void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [5 x i32], align 4, addrspace(5) %tmp0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0 - store i32 4, i32* %arrayidx1, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0 + store i32 4, i32 addrspace(5)* %arrayidx1, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 - %tmp2 = load i32, i32* %arrayidx10, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 + %tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 store i32 %tmp2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 - %tmp3 = load i32, i32* %arrayidx12 + %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 + %tmp3 = load i32, i32 addrspace(5)* %arrayidx12 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 store i32 %tmp3, i32 addrspace(1)* %arrayidx13 @@ -102,19 
+103,19 @@ ; GCN: workgroup_group_segment_byte_size = 1060 define amdgpu_kernel void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [5 x i32], align 4, addrspace(5) %tmp0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0 - store i32 4, i32* %arrayidx1, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0 + store i32 4, i32 addrspace(5)* %arrayidx1, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 - %tmp2 = load i32, i32* %arrayidx10, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 + %tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 store i32 %tmp2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 - %tmp3 = load i32, i32* %arrayidx12 + %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 + %tmp3 = load i32, i32 addrspace(5)* %arrayidx12 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 store i32 %tmp3, i32 addrspace(1)* %arrayidx13 Index: test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll +++ 
test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll @@ -1,26 +1,27 @@ ; RUN: llc -march=amdgcn -mattr=+promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +target datalayout = "A5" ; Pointer value is stored in a candidate for LDS usage. ; GCN-LABEL: {{^}}stored_lds_pointer_value: ; GCN: buffer_store_dword v -define amdgpu_kernel void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 { - %tmp = alloca float - store float 0.0, float *%tmp - store float* %tmp, float* addrspace(1)* %ptr +define amdgpu_kernel void @stored_lds_pointer_value(float addrspace(5)* addrspace(1)* %ptr) #0 { + %tmp = alloca float, addrspace(5) + store float 0.0, float addrspace(5)*%tmp + store float addrspace(5)* %tmp, float addrspace(5)* addrspace(1)* %ptr ret void } ; GCN-LABEL: {{^}}stored_lds_pointer_value_offset: ; GCN: buffer_store_dword v -define amdgpu_kernel void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 { - %tmp0 = alloca float - %tmp1 = alloca float - store float 0.0, float *%tmp0 - store float 0.0, float *%tmp1 - store volatile float* %tmp0, float* addrspace(1)* %ptr - store volatile float* %tmp1, float* addrspace(1)* %ptr +define amdgpu_kernel void @stored_lds_pointer_value_offset(float addrspace(5)* addrspace(1)* %ptr) #0 { + %tmp0 = alloca float, addrspace(5) + %tmp1 = alloca float, addrspace(5) + store float 0.0, float addrspace(5)*%tmp0 + store float 0.0, float addrspace(5)*%tmp1 + store volatile float addrspace(5)* %tmp0, float addrspace(5)* addrspace(1)* %ptr + store volatile float addrspace(5)* %tmp1, float addrspace(5)* addrspace(1)* %ptr ret void } @@ -29,12 +30,12 @@ ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; GCN: buffer_store_dword v ; GCN: buffer_store_dword v -define amdgpu_kernel void @stored_lds_pointer_value_gep(float* addrspace(1)* 
%ptr, i32 %idx) #0 { +define amdgpu_kernel void @stored_lds_pointer_value_gep(float addrspace(5)* addrspace(1)* %ptr, i32 %idx) #0 { bb: - %tmp = alloca float, i32 16 - store float 0.0, float* %tmp - %tmp2 = getelementptr inbounds float, float* %tmp, i32 %idx - store float* %tmp2, float* addrspace(1)* %ptr + %tmp = alloca float, i32 16, addrspace(5) + store float 0.0, float addrspace(5)* %tmp + %tmp2 = getelementptr inbounds float, float addrspace(5)* %tmp, i32 %idx + store float addrspace(5)* %tmp2, float addrspace(5)* addrspace(1)* %ptr ret void } @@ -46,29 +47,29 @@ ; GCN: buffer_store_dword ; GCN: buffer_store_dword ; GCN: buffer_store_dword -define amdgpu_kernel void @stored_vector_pointer_value(i32* addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @stored_vector_pointer_value(i32 addrspace(5)* addrspace(1)* %out, i32 %index) { entry: - %tmp0 = alloca [4 x i32] - %x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 0 - %y = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 1 - %z = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 2 - %w = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 3 - store i32 0, i32* %x - store i32 1, i32* %y - store i32 2, i32* %z - store i32 3, i32* %w - %tmp1 = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 %index - store i32* %tmp1, i32* addrspace(1)* %out + %tmp0 = alloca [4 x i32], addrspace(5) + %x = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 0 + %y = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 1 + %z = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 2 + %w = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 3 + store i32 0, i32 addrspace(5)* %x + store i32 1, i32 addrspace(5)* %y + store i32 2, i32 addrspace(5)* %z + store i32 3, i32 addrspace(5)* %w + %tmp1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 
0, i32 %index + store i32 addrspace(5)* %tmp1, i32 addrspace(5)* addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}stored_fi_to_self: ; GCN-NOT: ds_ define amdgpu_kernel void @stored_fi_to_self() #0 { - %tmp = alloca i32* - store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp - %bitcast = bitcast i32** %tmp to i32* - store volatile i32* %bitcast, i32** %tmp + %tmp = alloca i32 addrspace(5)*, addrspace(5) + store volatile i32 addrspace(5)* inttoptr (i32 1234 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp + %bitcast = bitcast i32 addrspace(5)* addrspace(5)* %tmp to i32 addrspace(5)* + store volatile i32 addrspace(5)* %bitcast, i32 addrspace(5)* addrspace(5)* %tmp ret void } Index: test/CodeGen/AMDGPU/r600.alu-limits.ll =================================================================== --- test/CodeGen/AMDGPU/r600.alu-limits.ll +++ test/CodeGen/AMDGPU/r600.alu-limits.ll @@ -6,10 +6,10 @@ %struct.foo = type {i32, i32, i32} -define amdgpu_kernel void @alu_limits(i32 addrspace(1)* %out, %struct.foo* %in, i32 %offset) { +define amdgpu_kernel void @alu_limits(i32 addrspace(1)* %out, %struct.foo addrspace(5)* %in, i32 %offset) { entry: - %ptr = getelementptr inbounds %struct.foo, %struct.foo* %in, i32 1, i32 2 - %x = load i32, i32 *%ptr, align 4 + %ptr = getelementptr inbounds %struct.foo, %struct.foo addrspace(5)* %in, i32 1, i32 2 + %x = load i32, i32 addrspace(5)*%ptr, align 4 br label %loop loop: %i = phi i32 [ 100, %entry ], [ %nexti, %loop ] Index: test/CodeGen/AMDGPU/r600.private-memory.ll =================================================================== --- test/CodeGen/AMDGPU/r600.private-memory.ll +++ test/CodeGen/AMDGPU/r600.private-memory.ll @@ -1,4 +1,5 @@ ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC +target datalayout = "A5" declare i32 @llvm.r600.read.tidig.x() nounwind readnone @@ -12,13 +13,13 @@ define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) { entry: - %0 = 
alloca [2 x i32] - %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0 - %2 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 1 - store i32 0, i32* %1 - store i32 1, i32* %2 - %3 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 %in - %4 = load i32, i32* %3 + %0 = alloca [2 x i32], addrspace(5) + %1 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 0 + %2 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 1 + store i32 0, i32 addrspace(5)* %1 + store i32 1, i32 addrspace(5)* %2 + %3 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 %in + %4 = load i32, i32 addrspace(5)* %3 %5 = call i32 @llvm.r600.read.tidig.x() %6 = add i32 %4, %5 store i32 %6, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/sad.ll =================================================================== --- test/CodeGen/AMDGPU/sad.ll +++ test/CodeGen/AMDGPU/sad.ll @@ -59,7 +59,7 @@ %t1 = select i1 %icmp1, i32 %a, i32 %b %ret0 = sub i32 %t0, %t1 - store volatile i32 %ret0, i32 *undef + store volatile i32 %ret0, i32 addrspace(5)*undef %ret = add i32 %ret0, %c store i32 %ret, i32 addrspace(1)* %out @@ -77,7 +77,7 @@ %ret0 = sub i32 %t0, %t1 %ret = add i32 %ret0, %c - store volatile i32 %ret, i32 *undef + store volatile i32 %ret, i32 addrspace(5)*undef store i32 %ret, i32 addrspace(1)* %out ret void } @@ -87,7 +87,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b - store volatile i32 %t0, i32 *undef + store volatile i32 %t0, i32 addrspace(5)*undef %icmp1 = icmp ule i32 %a, %b %t1 = select i1 %icmp1, i32 %a, i32 %b @@ -108,7 +108,7 @@ %icmp1 = icmp ule i32 %a, %b %t1 = select i1 %icmp1, i32 %a, i32 %b - store volatile i32 %t1, i32 *undef + store volatile i32 %t1, i32 addrspace(5)*undef %ret0 = sub i32 %t0, %t1 %ret = add i32 %ret0, %c @@ -122,7 +122,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(i32 
addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b - store volatile i32 %sub0, i32 *undef + store volatile i32 %sub0, i32 addrspace(5)*undef %sub1 = sub i32 %b, %a %ret0 = select i1 %icmp0, i32 %sub0, i32 %sub1 @@ -141,7 +141,7 @@ %sub0 = sub i32 %a, %b %sub1 = sub i32 %b, %a %ret0 = select i1 %icmp0, i32 %sub0, i32 %sub1 - store volatile i32 %ret0, i32 *undef + store volatile i32 %ret0, i32 addrspace(5)*undef %ret = add i32 %ret0, %c Index: test/CodeGen/AMDGPU/scratch-buffer.ll =================================================================== --- test/CodeGen/AMDGPU/scratch-buffer.ll +++ test/CodeGen/AMDGPU/scratch-buffer.ll @@ -1,5 +1,6 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +target datalayout = "A5" ; When a frame index offset is more than 12-bits, make sure we don't store ; it in mubuf's offset field. 
@@ -15,26 +16,26 @@ define amdgpu_kernel void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) { entry: - %scratch0 = alloca [8192 x i32] - %scratch1 = alloca [8192 x i32] + %scratch0 = alloca [8192 x i32], addrspace(5) + %scratch1 = alloca [8192 x i32], addrspace(5) - %scratchptr0 = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 0 - store i32 1, i32* %scratchptr0 + %scratchptr0 = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch0, i32 0, i32 0 + store i32 1, i32 addrspace(5)* %scratchptr0 - %scratchptr1 = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 0 - store i32 2, i32* %scratchptr1 + %scratchptr1 = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch1, i32 0, i32 0 + store i32 2, i32 addrspace(5)* %scratchptr1 %cmp = icmp eq i32 %cond, 0 br i1 %cmp, label %if, label %else if: - %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset - %if_value = load i32, i32* %if_ptr + %if_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch0, i32 0, i32 %if_offset + %if_value = load i32, i32 addrspace(5)* %if_ptr br label %done else: - %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset - %else_value = load i32, i32* %else_ptr + %else_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch1, i32 0, i32 %else_offset + %else_value = load i32, i32 addrspace(5)* %else_ptr br label %done done: @@ -55,29 +56,29 @@ define amdgpu_kernel void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { entry: - %scratch0 = alloca [8192 x i32] - %scratch1 = alloca [8192 x i32] + %scratch0 = alloca [8192 x i32], addrspace(5) + %scratch1 = alloca [8192 x i32], addrspace(5) %offset0 = load i32, i32 addrspace(1)* %offsets - %scratchptr0 = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %offset0 - store i32 %offset0, i32* 
%scratchptr0 + %scratchptr0 = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch0, i32 0, i32 %offset0 + store i32 %offset0, i32 addrspace(5)* %scratchptr0 %offsetptr1 = getelementptr i32, i32 addrspace(1)* %offsets, i32 1 %offset1 = load i32, i32 addrspace(1)* %offsetptr1 - %scratchptr1 = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %offset1 - store i32 %offset1, i32* %scratchptr1 + %scratchptr1 = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch1, i32 0, i32 %offset1 + store i32 %offset1, i32 addrspace(5)* %scratchptr1 %cmp = icmp eq i32 %cond, 0 br i1 %cmp, label %if, label %else if: - %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset - %if_value = load i32, i32* %if_ptr + %if_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch0, i32 0, i32 %if_offset + %if_value = load i32, i32 addrspace(5)* %if_ptr br label %done else: - %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset - %else_value = load i32, i32* %else_ptr + %else_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch1, i32 0, i32 %else_offset + %else_value = load i32, i32 addrspace(5)* %else_ptr br label %done done: @@ -91,10 +92,10 @@ ; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}} define amdgpu_kernel void @neg_vaddr_offset_inbounds(i32 %offset) { entry: - %array = alloca [8192 x i32] + %array = alloca [8192 x i32], addrspace(5) %ptr_offset = add i32 %offset, 4 - %ptr = getelementptr inbounds [8192 x i32], [8192 x i32]* %array, i32 0, i32 %ptr_offset - store i32 0, i32* %ptr + %ptr = getelementptr inbounds [8192 x i32], [8192 x i32] addrspace(5)* %array, i32 0, i32 %ptr_offset + store i32 0, i32 addrspace(5)* %ptr ret void } @@ -103,10 +104,10 @@ ; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}} define amdgpu_kernel void @neg_vaddr_offset(i32 %offset) { entry: - 
%array = alloca [8192 x i32] + %array = alloca [8192 x i32], addrspace(5) %ptr_offset = add i32 %offset, 4 - %ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 %ptr_offset - store i32 0, i32* %ptr + %ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %array, i32 0, i32 %ptr_offset + store i32 0, i32 addrspace(5)* %ptr ret void } @@ -114,11 +115,11 @@ ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:20 define amdgpu_kernel void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) { entry: - %array = alloca [8192 x i32] - %ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 4 - store i32 0, i32* %ptr - %load_ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 %offset - %val = load i32, i32* %load_ptr + %array = alloca [8192 x i32], addrspace(5) + %ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %array, i32 0, i32 4 + store i32 0, i32 addrspace(5)* %ptr + %load_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %array, i32 0, i32 %offset + %val = load i32, i32 addrspace(5)* %load_ptr store i32 %val, i32 addrspace(1)* %out ret void } Index: test/CodeGen/AMDGPU/shl_add_ptr.ll =================================================================== --- test/CodeGen/AMDGPU/shl_add_ptr.ll +++ test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -342,10 +342,10 @@ %idx.add = add nuw i32 %idx, 4 %shl0 = shl i32 %idx.add, 2 %shl1 = shl i32 %idx.add, 3 - %ptr0 = inttoptr i32 %shl0 to i32* - %ptr1 = inttoptr i32 %shl1 to i32* - store volatile i32 9, i32* %ptr0 - store volatile i32 10, i32* %ptr1 + %ptr0 = inttoptr i32 %shl0 to i32 addrspace(5)* + %ptr1 = inttoptr i32 %shl1 to i32 addrspace(5)* + store volatile i32 9, i32 addrspace(5)* %ptr0 + store volatile i32 10, i32 addrspace(5)* %ptr1 ret void } @@ -360,10 +360,10 @@ %idx.add = add nuw i32 %idx, 511 %shl0 = shl i32 %idx.add, 3 %shl1 = shl i32 %idx.add, 4 - %ptr0 = inttoptr i32 %shl0 to i32* - %ptr1 = inttoptr i32 %shl1 to i32* 
- store volatile i32 9, i32* %ptr0 - store volatile i32 10, i32* %ptr1 + %ptr0 = inttoptr i32 %shl0 to i32 addrspace(5)* + %ptr1 = inttoptr i32 %shl1 to i32 addrspace(5)* + store volatile i32 9, i32 addrspace(5)* %ptr0 + store volatile i32 10, i32 addrspace(5)* %ptr1 ret void } ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_both_max_private_offset: @@ -377,10 +377,10 @@ %idx.add = add nuw i32 %idx, 256 %shl0 = shl i32 %idx.add, 4 %shl1 = shl i32 %idx.add, 5 - %ptr0 = inttoptr i32 %shl0 to i32* - %ptr1 = inttoptr i32 %shl1 to i32* - store volatile i32 9, i32* %ptr0 - store volatile i32 10, i32* %ptr1 + %ptr0 = inttoptr i32 %shl0 to i32 addrspace(5)* + %ptr1 = inttoptr i32 %shl1 to i32 addrspace(5)* + store volatile i32 9, i32 addrspace(5)* %ptr0 + store volatile i32 10, i32 addrspace(5)* %ptr1 ret void } Index: test/CodeGen/AMDGPU/stack-size-overflow.ll =================================================================== --- test/CodeGen/AMDGPU/stack-size-overflow.ll +++ test/CodeGen/AMDGPU/stack-size-overflow.ll @@ -1,14 +1,15 @@ ; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=ERROR %s ; RUN: not llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s +target datalayout = "A5" -declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #1 +declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture, i8, i32, i32, i1) #1 ; ERROR: error: stack size limit exceeded (4294967296) in stack_size_limit ; GCN: ; ScratchSize: 4294967296 define amdgpu_kernel void @stack_size_limit() #0 { entry: - %alloca = alloca [1073741823 x i32], align 4 - %bc = bitcast [1073741823 x i32]* %alloca to i8* - call void @llvm.memset.p0i8.i32(i8* %bc, i8 9, i32 1073741823, i32 1, i1 true) + %alloca = alloca [1073741823 x i32], align 4, addrspace(5) + %bc = bitcast [1073741823 x i32] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i32(i8 addrspace(5)* %bc, i8 9, i32 1073741823, i32 1, i1 true) ret void } Index: test/CodeGen/AMDGPU/store-hi16.ll 
=================================================================== --- test/CodeGen/AMDGPU/store-hi16.ll +++ test/CodeGen/AMDGPU/store-hi16.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +target datalayout = "A5" ; GCN-LABEL: {{^}}store_global_hi_v2i16: ; GCN: s_waitcnt @@ -187,11 +188,11 @@ ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 -define void @store_flat_hi_v2i16(i16 addrspace(4)* %out, i32 %arg) #0 { +define void @store_flat_hi_v2i16(i16* %out, i32 %arg) #0 { entry: %value = bitcast i32 %arg to <2 x i16> %hi = extractelement <2 x i16> %value, i32 1 - store i16 %hi, i16 addrspace(4)* %out + store i16 %hi, i16* %out ret void } @@ -205,11 +206,11 @@ ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 -define void @store_flat_hi_v2f16(half addrspace(4)* %out, i32 %arg) #0 { +define void @store_flat_hi_v2f16(half* %out, i32 %arg) #0 { entry: %value = bitcast i32 %arg to <2 x half> %hi = extractelement <2 x half> %value, i32 1 - store half %hi, half addrspace(4)* %out + store half %hi, half* %out ret void } @@ -223,11 +224,11 @@ ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 -define void @store_flat_hi_i32_shift(i16 addrspace(4)* %out, i32 %value) #0 { +define void @store_flat_hi_i32_shift(i16* %out, i32 %value) #0 { entry: %hi32 = lshr i32 %value, 16 %hi = trunc i32 %hi32 to i16 - store i16 %hi, i16 addrspace(4)* %out + store i16 %hi, i16* %out ret void } @@ -241,12 +242,12 @@ ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 -define void @store_flat_hi_v2i16_i8(i8 addrspace(4)* %out, i32 %arg) #0 { +define void @store_flat_hi_v2i16_i8(i8* %out, i32 %arg) #0 { entry: %value = bitcast i32 %arg to <2 x i16> %hi = extractelement <2 x i16> %value, i32 1 %trunc = trunc i16 %hi to i8 - store i8 %trunc, i8 addrspace(4)* %out + 
store i8 %trunc, i8* %out ret void } @@ -260,11 +261,11 @@ ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 -define void @store_flat_hi_i8_shift(i8 addrspace(4)* %out, i32 %value) #0 { +define void @store_flat_hi_i8_shift(i8* %out, i32 %value) #0 { entry: %hi32 = lshr i32 %value, 16 %hi = trunc i32 %hi32 to i8 - store i8 %hi, i8 addrspace(4)* %out + store i8 %hi, i8* %out ret void } @@ -278,12 +279,12 @@ ; VI: flat_store_short v[0:1], v2{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 -define void @store_flat_hi_v2i16_max_offset(i16 addrspace(4)* %out, i32 %arg) #0 { +define void @store_flat_hi_v2i16_max_offset(i16* %out, i32 %arg) #0 { entry: %value = bitcast i32 %arg to <2 x i16> %hi = extractelement <2 x i16> %value, i32 1 - %gep = getelementptr inbounds i16, i16 addrspace(4)* %out, i64 2047 - store i16 %hi, i16 addrspace(4)* %gep + %gep = getelementptr inbounds i16, i16* %out, i64 2047 + store i16 %hi, i16* %gep ret void } @@ -297,12 +298,12 @@ ; VI: flat_store_short v[0:1], v2{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 -define void @store_flat_hi_v2i16_neg_offset(i16 addrspace(4)* %out, i32 %arg) #0 { +define void @store_flat_hi_v2i16_neg_offset(i16* %out, i32 %arg) #0 { entry: %value = bitcast i32 %arg to <2 x i16> %hi = extractelement <2 x i16> %value, i32 1 - %gep = getelementptr inbounds i16, i16 addrspace(4)* %out, i64 -1023 - store i16 %hi, i16 addrspace(4)* %gep + %gep = getelementptr inbounds i16, i16* %out, i64 -1023 + store i16 %hi, i16* %gep ret void } @@ -316,13 +317,13 @@ ; VI: flat_store_byte v[0:1], v2{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 -define void @store_flat_hi_v2i16_i8_max_offset(i8 addrspace(4)* %out, i32 %arg) #0 { +define void @store_flat_hi_v2i16_i8_max_offset(i8* %out, i32 %arg) #0 { entry: %value = bitcast i32 %arg to <2 x i16> %hi = extractelement <2 x i16> %value, i32 1 %trunc = trunc i16 %hi to i8 - %gep = getelementptr inbounds i8, i8 addrspace(4)* %out, i64 4095 - store i8 %trunc, i8 addrspace(4)* %gep + 
%gep = getelementptr inbounds i8, i8* %out, i64 4095 + store i8 %trunc, i8* %gep ret void } @@ -337,13 +338,13 @@ ; VI: flat_store_byte v[0:1], v2{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 -define void @store_flat_hi_v2i16_i8_neg_offset(i8 addrspace(4)* %out, i32 %arg) #0 { +define void @store_flat_hi_v2i16_i8_neg_offset(i8* %out, i32 %arg) #0 { entry: %value = bitcast i32 %arg to <2 x i16> %hi = extractelement <2 x i16> %value, i32 1 %trunc = trunc i16 %hi to i8 - %gep = getelementptr inbounds i8, i8 addrspace(4)* %out, i64 -4095 - store i8 %trunc, i8 addrspace(4)* %gep + %gep = getelementptr inbounds i8, i8* %out, i64 -4095 + store i8 %trunc, i8* %gep ret void } @@ -357,12 +358,12 @@ ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 -define void @store_private_hi_v2i16(i16* %out, i32 %arg) #0 { +define void @store_private_hi_v2i16(i16 addrspace(5)* %out, i32 %arg) #0 { entry: ; FIXME: ABI for pre-gfx9 %value = bitcast i32 %arg to <2 x i16> %hi = extractelement <2 x i16> %value, i32 1 - store i16 %hi, i16* %out + store i16 %hi, i16 addrspace(5)* %out ret void } @@ -376,12 +377,12 @@ ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 -define void @store_private_hi_v2f16(half* %out, i32 %arg) #0 { +define void @store_private_hi_v2f16(half addrspace(5)* %out, i32 %arg) #0 { entry: ; FIXME: ABI for pre-gfx9 %value = bitcast i32 %arg to <2 x half> %hi = extractelement <2 x half> %value, i32 1 - store half %hi, half* %out + store half %hi, half addrspace(5)* %out ret void } @@ -395,11 +396,11 @@ ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 -define void @store_private_hi_i32_shift(i16* %out, i32 %value) #0 { +define void @store_private_hi_i32_shift(i16 addrspace(5)* %out, i32 %value) #0 { entry: %hi32 = lshr i32 %value, 16 %hi = trunc i32 %hi32 to i16 - store i16 %hi, i16* %out + store i16 %hi, i16 addrspace(5)* %out ret void } @@ -413,12 +414,12 @@ ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 -define void @store_private_hi_v2i16_i8(i8* %out, i32 %arg) #0 { +define 
void @store_private_hi_v2i16_i8(i8 addrspace(5)* %out, i32 %arg) #0 { entry: %value = bitcast i32 %arg to <2 x i16> %hi = extractelement <2 x i16> %value, i32 1 %trunc = trunc i16 %hi to i8 - store i8 %trunc, i8* %out + store i8 %trunc, i8 addrspace(5)* %out ret void } @@ -432,11 +433,11 @@ ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 -define void @store_private_hi_i8_shift(i8* %out, i32 %value) #0 { +define void @store_private_hi_i8_shift(i8 addrspace(5)* %out, i32 %value) #0 { entry: %hi32 = lshr i32 %value, 16 %hi = trunc i32 %hi32 to i8 - store i8 %hi, i8* %out + store i8 %hi, i8 addrspace(5)* %out ret void } @@ -449,12 +450,12 @@ ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 -define void @store_private_hi_v2i16_max_offset(i16* byval %out, i32 %arg) #0 { +define void @store_private_hi_v2i16_max_offset(i16 addrspace(5)* byval %out, i32 %arg) #0 { entry: %value = bitcast i32 %arg to <2 x i16> %hi = extractelement <2 x i16> %value, i32 1 - %gep = getelementptr inbounds i16, i16* %out, i64 2045 - store i16 %hi, i16* %gep + %gep = getelementptr inbounds i16, i16 addrspace(5)* %out, i64 2045 + store i16 %hi, i16 addrspace(5)* %gep ret void } @@ -475,7 +476,7 @@ ; FIXME: ABI for pre-gfx9 %value = bitcast i32 %arg to <2 x i16> %hi = extractelement <2 x i16> %value, i32 1 - store volatile i16 %hi, i16* null + store volatile i16 %hi, i16 addrspace(5)* null ret void } @@ -495,7 +496,7 @@ %value = bitcast i32 %arg to <2 x i16> %hi = extractelement <2 x i16> %value, i32 1 %trunc = trunc i16 %hi to i8 - store volatile i8 %trunc, i8* null + store volatile i8 %trunc, i8 addrspace(5)* null ret void } @@ -599,14 +600,14 @@ ; GFX9-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s5 offset:4094 define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 { entry: - %obj0 = alloca [10 x i32], align 4 - %obj1 = alloca [4096 x i16], align 2 - %bc = bitcast [10 x i32]* %obj0 to i32* - store volatile i32 123, i32* %bc + %obj0 = alloca [10 x i32], align 4, addrspace(5) + %obj1 = alloca 
[4096 x i16], align 2, addrspace(5) + %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* + store volatile i32 123, i32 addrspace(5)* %bc %value = bitcast i32 %arg to <2 x i16> %hi = extractelement <2 x i16> %value, i32 1 - %gep = getelementptr inbounds [4096 x i16], [4096 x i16]* %obj1, i32 0, i32 2025 - store i16 %hi, i16* %gep + %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025 + store i16 %hi, i16 addrspace(5)* %gep ret void } @@ -616,15 +617,15 @@ ; GFX9-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s5 offset:4095 define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 { entry: - %obj0 = alloca [10 x i32], align 4 - %obj1 = alloca [4096 x i8], align 2 - %bc = bitcast [10 x i32]* %obj0 to i32* - store volatile i32 123, i32* %bc + %obj0 = alloca [10 x i32], align 4, addrspace(5) + %obj1 = alloca [4096 x i8], align 2, addrspace(5) + %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* + store volatile i32 123, i32 addrspace(5)* %bc %value = bitcast i32 %arg to <2 x i16> %hi = extractelement <2 x i16> %value, i32 1 - %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051 + %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051 %trunc = trunc i16 %hi to i8 - store i8 %trunc, i8* %gep + store i8 %trunc, i8 addrspace(5)* %gep ret void } Index: test/CodeGen/AMDGPU/store-private.ll =================================================================== --- test/CodeGen/AMDGPU/store-private.ll +++ test/CodeGen/AMDGPU/store-private.ll @@ -15,9 +15,9 @@ ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; SI: buffer_store_byte -define amdgpu_kernel void @store_i1(i1 addrspace(0)* %out) { +define amdgpu_kernel void @store_i1(i1 addrspace(5)* %out) { entry: - store i1 true, i1 addrspace(0)* %out + store i1 true, i1 addrspace(5)* %out ret void } @@ -44,9 +44,9 @@ ; SI: buffer_store_byte -define amdgpu_kernel void @store_i8(i8 addrspace(0)* %out, i8 %in) { 
+define amdgpu_kernel void @store_i8(i8 addrspace(5)* %out, i8 %in) { entry: - store i8 %in, i8 addrspace(0)* %out + store i8 %in, i8 addrspace(5)* %out ret void } @@ -72,9 +72,9 @@ ; EG: MOV * T(0 + AR.x).X+, [[RES]] ; SI: buffer_store_short -define amdgpu_kernel void @store_i16(i16 addrspace(0)* %out, i16 %in) { +define amdgpu_kernel void @store_i16(i16 addrspace(5)* %out, i16 %in) { entry: - store i16 %in, i16 addrspace(0)* %out + store i16 %in, i16 addrspace(5)* %out ret void } @@ -102,9 +102,9 @@ ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, ; CM: MOVA_INT ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, -define amdgpu_kernel void @store_i24(i24 addrspace(0)* %out, i24 %in) { +define amdgpu_kernel void @store_i24(i24 addrspace(5)* %out, i24 %in) { entry: - store i24 %in, i24 addrspace(0)* %out + store i24 %in, i24 addrspace(5)* %out ret void } @@ -120,9 +120,9 @@ ; CM: MOVA_INT ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; CM-NOT: MOVA_INT -define amdgpu_kernel void @store_i25(i25 addrspace(0)* %out, i25 %in) { +define amdgpu_kernel void @store_i25(i25 addrspace(5)* %out, i25 %in) { entry: - store i25 %in, i25 addrspace(0)* %out + store i25 %in, i25 addrspace(5)* %out ret void } @@ -141,10 +141,10 @@ ; CM-NOT: MOVA_INT ; SI: buffer_store_short -define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(0)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(5)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i8> - store <2 x i8> %0, <2 x i8> addrspace(0)* %out + store <2 x i8> %0, <2 x i8> addrspace(5)* %out ret void } @@ -172,10 +172,10 @@ ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; SI: buffer_store_byte -define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(0)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(5)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i8> - store <2 x i8> %0, <2 x i8> addrspace(0)* %out, align 1 + store <2 x i8> %0, <2 x i8> addrspace(5)* 
%out, align 1 ret void } @@ -191,10 +191,10 @@ ; CM-NOT: MOVA_INT ; SI: buffer_store_dword -define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(0)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(5)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i16> - store <2 x i16> %0, <2 x i16> addrspace(0)* %out + store <2 x i16> %0, <2 x i16> addrspace(5)* %out ret void } @@ -223,10 +223,10 @@ ; SI: buffer_store_short ; SI: buffer_store_short -define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(0)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(5)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i16> - store <2 x i16> %0, <2 x i16> addrspace(0)* %out, align 2 + store <2 x i16> %0, <2 x i16> addrspace(5)* %out, align 2 ret void } @@ -240,10 +240,10 @@ ; CM-NOT: MOVA_INT ; SI: buffer_store_dword -define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(5)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> - store <4 x i8> %0, <4 x i8> addrspace(0)* %out + store <4 x i8> %0, <4 x i8> addrspace(5)* %out ret void } @@ -299,10 +299,10 @@ ; SI: buffer_store_byte ; SI: buffer_store_byte ; SI-NOT: buffer_store_dword -define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(5)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> - store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 1 + store <4 x i8> %0, <4 x i8> addrspace(5)* %out, align 1 ret void } @@ -410,10 +410,10 @@ ; SI: buffer_store_byte ; SI: buffer_store_byte ; SI-NOT: buffer_store_dword -define amdgpu_kernel void @store_v8i8_unaligned(<8 x i8> addrspace(0)* %out, <8 x i32> %in) { +define amdgpu_kernel void @store_v8i8_unaligned(<8 x i8> addrspace(5)* %out, <8 x i32> 
%in) { entry: %0 = trunc <8 x i32> %in to <8 x i8> - store <8 x i8> %0, <8 x i8> addrspace(0)* %out, align 1 + store <8 x i8> %0, <8 x i8> addrspace(5)* %out, align 1 ret void } @@ -443,10 +443,10 @@ ; SI: buffer_store_short ; SI: buffer_store_short ; SI-NOT: buffer_store_dword -define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(5)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> - store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 2 + store <4 x i8> %0, <4 x i8> addrspace(5)* %out, align 2 ret void } @@ -460,8 +460,8 @@ ; SI: buffer_store_dword -define amdgpu_kernel void @store_f32(float addrspace(0)* %out, float %in) { - store float %in, float addrspace(0)* %out +define amdgpu_kernel void @store_f32(float addrspace(5)* %out, float %in) { + store float %in, float addrspace(5)* %out ret void } @@ -480,10 +480,10 @@ ; XSI: buffer_store_dwordx2 ; SI: buffer_store_dword ; SI: buffer_store_dword -define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(5)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i16> - store <4 x i16> %0, <4 x i16> addrspace(0)* %out + store <4 x i16> %0, <4 x i16> addrspace(5)* %out ret void } @@ -504,11 +504,11 @@ ; SI: buffer_store_dword ; SI: buffer_store_dword -define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(0)* %out, float %a, float %b) { +define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(5)* %out, float %a, float %b) { entry: %0 = insertelement <2 x float> , float %a, i32 0 %1 = insertelement <2 x float> %0, float %b, i32 1 - store <2 x float> %1, <2 x float> addrspace(0)* %out + store <2 x float> %1, <2 x float> addrspace(5)* %out ret void } @@ -533,8 +533,8 @@ ; SI: buffer_store_dword ; SI: buffer_store_dword -define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(0)* 
%out, <3 x i32> %a) nounwind { - store <3 x i32> %a, <3 x i32> addrspace(0)* %out, align 16 +define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(5)* %out, <3 x i32> %a) nounwind { + store <3 x i32> %a, <3 x i32> addrspace(5)* %out, align 16 ret void } @@ -563,9 +563,9 @@ ; SI: buffer_store_dword ; SI: buffer_store_dword ; SI: buffer_store_dword -define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(5)* %out, <4 x i32> %in) { entry: - store <4 x i32> %in, <4 x i32> addrspace(0)* %out + store <4 x i32> %in, <4 x i32> addrspace(5)* %out ret void } @@ -594,9 +594,9 @@ ; SI: buffer_store_dword ; SI: buffer_store_dword ; SI: buffer_store_dword -define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(5)* %out, <4 x i32> %in) { entry: - store <4 x i32> %in, <4 x i32> addrspace(0)* %out, align 4 + store <4 x i32> %in, <4 x i32> addrspace(5)* %out, align 4 ret void } @@ -626,9 +626,9 @@ ; SI: buffer_store_dword ; SI: buffer_store_dword ; SI: buffer_store_dword -define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0)* %in) { - %1 = load <4 x float>, <4 x float> addrspace(0) * %in - store <4 x float> %1, <4 x float> addrspace(0)* %out +define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(5)* %out, <4 x float> addrspace(5)* %in) { + %1 = load <4 x float>, <4 x float> addrspace(5)* %in + store <4 x float> %1, <4 x float> addrspace(5)* %out ret void } @@ -644,10 +644,10 @@ ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; SI: buffer_store_byte -define amdgpu_kernel void @store_i64_i8(i8 addrspace(0)* %out, i64 %in) { +define amdgpu_kernel void @store_i64_i8(i8 addrspace(5)* %out, i64 %in) { entry: %0 = trunc i64 %in to i8 - store i8 %0, i8 addrspace(0)* %out + store i8 %0, i8 addrspace(5)* %out ret void } @@ -663,10 +663,10 @@ ; CM: MOV 
{{[\* ]*}}T(0 + AR.x).X+, ; SI: buffer_store_short -define amdgpu_kernel void @store_i64_i16(i16 addrspace(0)* %out, i64 %in) { +define amdgpu_kernel void @store_i64_i16(i16 addrspace(5)* %out, i64 %in) { entry: %0 = trunc i64 %in to i16 - store i16 %0, i16 addrspace(0)* %out + store i16 %0, i16 addrspace(5)* %out ret void } @@ -689,14 +689,14 @@ ; XSI: buffer_store_dwordx2 ; SI: buffer_store_dword ; SI: buffer_store_dword -define amdgpu_kernel void @vecload2(i32 addrspace(0)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { +define amdgpu_kernel void @vecload2(i32 addrspace(5)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { entry: %0 = load i32, i32 addrspace(2)* %mem, align 4 %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1 %1 = load i32, i32 addrspace(2)* %arrayidx1.i, align 4 - store i32 %0, i32 addrspace(0)* %out, align 4 - %arrayidx1 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 1 - store i32 %1, i32 addrspace(0)* %arrayidx1, align 4 + store i32 %0, i32 addrspace(5)* %out, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 1 + store i32 %1, i32 addrspace(5)* %arrayidx1, align 4 ret void } @@ -727,15 +727,15 @@ ; SI: buffer_store_dword ; SI: buffer_store_dword ; SI: buffer_store_dword -define amdgpu_kernel void @i128-const-store(i32 addrspace(0)* %out) { +define amdgpu_kernel void @i128-const-store(i32 addrspace(5)* %out) { entry: - store i32 1, i32 addrspace(0)* %out, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 1 - store i32 1, i32 addrspace(0)* %arrayidx2, align 4 - %arrayidx4 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 2 - store i32 2, i32 addrspace(0)* %arrayidx4, align 4 - %arrayidx6 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 3 - store i32 2, i32 addrspace(0)* %arrayidx6, align 4 + store i32 1, i32 addrspace(5)* %out, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 1 + store i32 1, i32 
addrspace(5)* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 2 + store i32 2, i32 addrspace(5)* %arrayidx4, align 4 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 3 + store i32 2, i32 addrspace(5)* %arrayidx6, align 4 ret void } Index: test/CodeGen/AMDGPU/store-vector-ptrs.ll =================================================================== --- test/CodeGen/AMDGPU/store-vector-ptrs.ll +++ test/CodeGen/AMDGPU/store-vector-ptrs.ll @@ -5,8 +5,8 @@ ; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting ; scratch loads and stores. ; CHECK-LABEL: {{^}}store_vector_ptrs: -define amdgpu_kernel void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind { - %p = getelementptr [1024 x i32], <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> - store <4 x i32*> %p, <4 x i32*>* %out +define amdgpu_kernel void @store_vector_ptrs(<4 x i32 addrspace(5)*> addrspace(5)* %out, <4 x [1024 x i32] addrspace(5)*> %array) nounwind { + %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(5)*> %array, <4 x i16> zeroinitializer, <4 x i16> + store <4 x i32 addrspace(5)*> %p, <4 x i32 addrspace(5)*> addrspace(5)* %out ret void } Index: test/CodeGen/AMDGPU/syncscopes.ll =================================================================== --- test/CodeGen/AMDGPU/syncscopes.ll +++ test/CodeGen/AMDGPU/syncscopes.ll @@ -6,14 +6,14 @@ ; GCN: FLAT_STORE_DWORD killed renamable %vgpr7_vgpr8, killed renamable %vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out) define void @syncscopes( i32 %agent, - i32 addrspace(4)* %agent_out, + i32* %agent_out, i32 %workgroup, - i32 addrspace(4)* %workgroup_out, + i32* %workgroup_out, i32 %wavefront, - i32 addrspace(4)* %wavefront_out) { + i32* %wavefront_out) { entry: - store atomic i32 %agent, i32 addrspace(4)* %agent_out syncscope("agent") seq_cst, align 4 - store 
atomic i32 %workgroup, i32 addrspace(4)* %workgroup_out syncscope("workgroup") seq_cst, align 4 - store atomic i32 %wavefront, i32 addrspace(4)* %wavefront_out syncscope("wavefront") seq_cst, align 4 + store atomic i32 %agent, i32* %agent_out syncscope("agent") seq_cst, align 4 + store atomic i32 %workgroup, i32* %workgroup_out syncscope("workgroup") seq_cst, align 4 + store atomic i32 %wavefront, i32* %wavefront_out syncscope("wavefront") seq_cst, align 4 ret void } Index: test/CodeGen/AMDGPU/target-cpu.ll =================================================================== --- test/CodeGen/AMDGPU/target-cpu.ll +++ test/CodeGen/AMDGPU/target-cpu.ll @@ -1,4 +1,5 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s +target datalayout = "A5" declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #1 @@ -81,10 +82,10 @@ ; CHECK: ; LDSByteSize: 5120 define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [5 x i32], align 4, addrspace(5) %tmp = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp - %load = load i32, i32* %arrayidx1 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp + %load = load i32, i32 addrspace(5)* %arrayidx1 store i32 %load, i32 addrspace(1)* %out ret void } @@ -95,10 +96,10 @@ ; CHECK: ScratchSize: 24 define amdgpu_kernel void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [5 x i32], align 4, addrspace(5) %tmp = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp - %load = load i32, i32* %arrayidx1 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp + %load = 
load i32, i32 addrspace(5)* %arrayidx1 store i32 %load, i32 addrspace(1)* %out ret void } Index: test/CodeGen/AMDGPU/waitcnt-flat.ll =================================================================== --- test/CodeGen/AMDGPU/waitcnt-flat.ll +++ test/CodeGen/AMDGPU/waitcnt-flat.ll @@ -10,13 +10,13 @@ ; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]] ; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0) ; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test(i32 addrspace(4)* %out, i32 %in) { - store volatile i32 0, i32 addrspace(4)* %out - %val = load volatile i32, i32 addrspace(4)* %out +define amdgpu_kernel void @test(i32* %out, i32 %in) { + store volatile i32 0, i32* %out + %val = load volatile i32, i32* %out ret void } -; Make sure lgkmcnt isn't used for global_* instructions +; Make sure lgkmcnt isn't used for global_* instructions ; GCN-LABEL: {{^}}test_waitcnt_type_flat_global: ; GFX9: global_load_dword [[LD:v[0-9]+]] ; GFX9-NEXT: s_waitcnt vmcnt(0){{$}} Index: test/CodeGen/AMDGPU/waitcnt-looptest.ll =================================================================== --- test/CodeGen/AMDGPU/waitcnt-looptest.ll +++ test/CodeGen/AMDGPU/waitcnt-looptest.ll @@ -17,8 +17,8 @@ define amdgpu_kernel void @testKernel(i32 addrspace(1)* nocapture %arg) local_unnamed_addr #0 { bb: - store <2 x float> , <2 x float> addrspace(4)* bitcast (float addrspace(4)* getelementptr ([100 x float], [100 x float] addrspace(4)* addrspacecast ([100 x float] addrspace(1)* @data_generic to [100 x float] addrspace(4)*), i64 0, i64 4) to <2 x float> addrspace(4)*), align 4 - store <2 x float> , <2 x float> addrspace(4)* bitcast (float addrspace(4)* getelementptr ([100 x float], [100 x float] addrspace(4)* addrspacecast ([100 x float] addrspace(1)* @data_reference to [100 x float] addrspace(4)*), i64 0, i64 4) to <2 x float> addrspace(4)*), align 4 + store <2 x float> , <2 x float>* bitcast (float* getelementptr ([100 x float], [100 x float]*
addrspacecast ([100 x float] addrspace(1)* @data_generic to [100 x float]*), i64 0, i64 4) to <2 x float>*), align 4 + store <2 x float> , <2 x float>* bitcast (float* getelementptr ([100 x float], [100 x float]* addrspacecast ([100 x float] addrspace(1)* @data_reference to [100 x float]*), i64 0, i64 4) to <2 x float>*), align 4 br label %bb18 bb1: ; preds = %bb18 Index: test/CodeGen/AMDGPU/waitcnt.mir =================================================================== --- test/CodeGen/AMDGPU/waitcnt.mir +++ test/CodeGen/AMDGPU/waitcnt.mir @@ -3,8 +3,8 @@ --- | define amdgpu_kernel void @flat_zero_waitcnt(i32 addrspace(1)* %global4, <4 x i32> addrspace(1)* %global16, - i32 addrspace(4)* %flat4, - <4 x i32> addrspace(4)* %flat16) { + i32* %flat4, + <4 x i32>* %flat16) { ret void } Index: test/CodeGen/AMDGPU/wqm.ll =================================================================== --- test/CodeGen/AMDGPU/wqm.ll +++ test/CodeGen/AMDGPU/wqm.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=VI %s +target datalayout = "A5" ; Check that WQM isn't triggered by image load/store intrinsics. 
; @@ -657,17 +658,17 @@ ; CHECK: buffer_store_dwordx4 define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { entry: - %array = alloca [32 x i32], align 4 + %array = alloca [32 x i32], align 4, addrspace(5) call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) - %s.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 0 - store volatile i32 %a, i32* %s.gep, align 4 + %s.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 0 + store volatile i32 %a, i32 addrspace(5)* %s.gep, align 4 call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0) - %c.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 %idx - %c = load i32, i32* %c.gep, align 4 + %c.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 %idx + %c = load i32, i32 addrspace(5)* %c.gep, align 4 %c.bc = bitcast i32 %c to float %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) Index: test/DebugInfo/AMDGPU/code-pointer-size.ll =================================================================== --- test/DebugInfo/AMDGPU/code-pointer-size.ll +++ test/DebugInfo/AMDGPU/code-pointer-size.ll @@ -1,14 +1,15 @@ ; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -filetype=obj < %s | llvm-dwarfdump -debug-info - | FileCheck %s +target datalayout = "A5" ; LLVM IR generated with the following command and OpenCL source: ; ; $clang -cl-std=CL2.0 -g -O0 -target amdgcn-amd-amdhsa -S -emit-llvm ; -; kernel void kernel1(global int *A) { +; kernel void kernel1(global int *A) { ; *A = 11; ; } ; -; kernel void kernel2(global int *B) { +; kernel void kernel2(global int *B) { ; *B = 12; ; } @@ -20,20 +21,20 @@ define amdgpu_kernel
void @kernel1(i32 addrspace(1)* %A) !dbg !7 { entry: - %A.addr = alloca i32 addrspace(1)*, align 4 - store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 - call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !16, metadata !17), !dbg !18 - %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !19 + %A.addr = alloca i32 addrspace(1)*, align 4, addrspace(5) + store i32 addrspace(1)* %A, i32 addrspace(1)* addrspace(5)* %A.addr, align 4 + call void @llvm.dbg.declare(metadata i32 addrspace(1)* addrspace(5)* %A.addr, metadata !16, metadata !17), !dbg !18 + %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %A.addr, align 4, !dbg !19 store i32 11, i32 addrspace(1)* %0, align 4, !dbg !20 ret void, !dbg !21 } define amdgpu_kernel void @kernel2(i32 addrspace(1)* %B) !dbg !22 { entry: - %B.addr = alloca i32 addrspace(1)*, align 4 - store i32 addrspace(1)* %B, i32 addrspace(1)** %B.addr, align 4 - call void @llvm.dbg.declare(metadata i32 addrspace(1)** %B.addr, metadata !23, metadata !17), !dbg !24 - %0 = load i32 addrspace(1)*, i32 addrspace(1)** %B.addr, align 4, !dbg !25 + %B.addr = alloca i32 addrspace(1)*, align 4, addrspace(5) + store i32 addrspace(1)* %B, i32 addrspace(1)* addrspace(5)* %B.addr, align 4 + call void @llvm.dbg.declare(metadata i32 addrspace(1)* addrspace(5)* %B.addr, metadata !23, metadata !17), !dbg !24 + %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %B.addr, align 4, !dbg !25 store i32 12, i32 addrspace(1)* %0, align 4, !dbg !26 ret void, !dbg !27 } @@ -57,7 +58,7 @@ !11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) !12 = !{i32 1} !13 = !{!"none"} -!14 = !{!"int*"} +!14 = !{!"int*"} !15 = !{!""} !16 = !DILocalVariable(name: "A", arg: 1, scope: !7, file: !1, line: 1, type: !10) !17 = !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef) Index: test/DebugInfo/AMDGPU/dwarfdump-relocs.ll
=================================================================== --- test/DebugInfo/AMDGPU/dwarfdump-relocs.ll +++ test/DebugInfo/AMDGPU/dwarfdump-relocs.ll @@ -1,14 +1,15 @@ ; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -filetype=obj < %s | llvm-dwarfdump -v - 2>&1 | FileCheck %s +target datalayout = "A5" ; LLVM IR generated with the following command and OpenCL source: ; ; $clang -cl-std=CL2.0 -g -O0 -target amdgcn-amd-amdhsa -S -emit-llvm ; -; kernel void kernel1(global int *A) { +; kernel void kernel1(global int *A) { ; *A = 11; ; } ; -; kernel void kernel2(global int *B) { +; kernel void kernel2(global int *B) { ; *B = 12; ; } @@ -19,20 +20,20 @@ define amdgpu_kernel void @kernel1(i32 addrspace(1)* %A) !dbg !7 { entry: - %A.addr = alloca i32 addrspace(1)*, align 4 - store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 - call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !16, metadata !17), !dbg !18 - %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !19 + %A.addr = alloca i32 addrspace(1)*, align 4, addrspace(5) + store i32 addrspace(1)* %A, i32 addrspace(1)* addrspace(5)* %A.addr, align 4 + call void @llvm.dbg.declare(metadata i32 addrspace(1)* addrspace(5)* %A.addr, metadata !16, metadata !17), !dbg !18 + %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %A.addr, align 4, !dbg !19 store i32 11, i32 addrspace(1)* %0, align 4, !dbg !20 ret void, !dbg !21 } define amdgpu_kernel void @kernel2(i32 addrspace(1)* %B) !dbg !22 { entry: - %B.addr = alloca i32 addrspace(1)*, align 4 - store i32 addrspace(1)* %B, i32 addrspace(1)** %B.addr, align 4 - call void @llvm.dbg.declare(metadata i32 addrspace(1)** %B.addr, metadata !23, metadata !17), !dbg !24 - %0 = load i32 addrspace(1)*, i32 addrspace(1)** %B.addr, align 4, !dbg !25 + %B.addr = alloca i32 addrspace(1)*, align 4, addrspace(5) + store i32 addrspace(1)* %B, i32 addrspace(1)* addrspace(5)*
%B.addr, align 4 + call void @llvm.dbg.declare(metadata i32 addrspace(1)* addrspace(5)* %B.addr, metadata !23, metadata !17), !dbg !24 + %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %B.addr, align 4, !dbg !25 store i32 12, i32 addrspace(1)* %0, align 4, !dbg !26 ret void, !dbg !27 } @@ -56,7 +57,7 @@ !11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) !12 = !{i32 1} !13 = !{!"none"} -!14 = !{!"int*"} +!14 = !{!"int*"} !15 = !{!""} !16 = !DILocalVariable(name: "A", arg: 1, scope: !7, file: !1, line: 1, type: !10) !17 = !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef) Index: test/DebugInfo/AMDGPU/pointer-address-space.ll =================================================================== --- test/DebugInfo/AMDGPU/pointer-address-space.ll +++ test/DebugInfo/AMDGPU/pointer-address-space.ll @@ -1,15 +1,16 @@ ; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -filetype=obj < %s | llvm-dwarfdump -v -debug-info - | FileCheck %s +target datalayout = "A5" ; LLVM IR generated with the following command and OpenCL source: ; ; $clang -cl-std=CL2.0 -g -O0 -target amdgcn-amd-amdhsa -S -emit-llvm ; ; kernel void kernel1() { -; global int *FuncVar0 = 0; -; constant int *FuncVar1 = 0; -; local int *FuncVar2 = 0; -; private int *FuncVar3 = 0; -; int *FuncVar4 = 0; +; global int *FuncVar0 = 0; +; constant int *FuncVar1 = 0; +; local int *FuncVar2 = 0; +; private int *FuncVar3 = 0; +; int *FuncVar4 = 0; ; } ; CHECK: DW_AT_name {{.*}}"FuncVar0" @@ -53,21 +54,21 @@ define amdgpu_kernel void @kernel1() !dbg !7 { entry: - %FuncVar0 = alloca i32 addrspace(1)*, align 4 - %FuncVar1 = alloca i32 addrspace(2)*, align 4 - %FuncVar2 = alloca i32 addrspace(3)*, align 4 - %FuncVar3 = alloca i32*, align 4 - %FuncVar4 = alloca i32 addrspace(4)*, align 4 - call void @llvm.dbg.declare(metadata i32 addrspace(1)** %FuncVar0, metadata !10, metadata !13), !dbg !14 - store i32
addrspace(1)* null, i32 addrspace(1)** %FuncVar0, align 4, !dbg !14 - call void @llvm.dbg.declare(metadata i32 addrspace(2)** %FuncVar1, metadata !15, metadata !13), !dbg !16 - store i32 addrspace(2)* null, i32 addrspace(2)** %FuncVar1, align 4, !dbg !16 - call void @llvm.dbg.declare(metadata i32 addrspace(3)** %FuncVar2, metadata !17, metadata !13), !dbg !19 - store i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*), i32 addrspace(3)** %FuncVar2, align 4, !dbg !19 - call void @llvm.dbg.declare(metadata i32** %FuncVar3, metadata !20, metadata !13), !dbg !22 - store i32* addrspacecast (i32 addrspace(4)* null to i32*), i32** %FuncVar3, align 4, !dbg !22 - call void @llvm.dbg.declare(metadata i32 addrspace(4)** %FuncVar4, metadata !23, metadata !13), !dbg !24 - store i32 addrspace(4)* null, i32 addrspace(4)** %FuncVar4, align 4, !dbg !24 + %FuncVar0 = alloca i32 addrspace(1)*, align 4, addrspace(5) + %FuncVar1 = alloca i32 addrspace(2)*, align 4, addrspace(5) + %FuncVar2 = alloca i32 addrspace(3)*, align 4, addrspace(5) + %FuncVar3 = alloca i32 addrspace(5)*, align 4, addrspace(5) + %FuncVar4 = alloca i32*, align 4, addrspace(5) + call void @llvm.dbg.declare(metadata i32 addrspace(1)* addrspace(5)* %FuncVar0, metadata !10, metadata !13), !dbg !14 + store i32 addrspace(1)* null, i32 addrspace(1)* addrspace(5)* %FuncVar0, align 4, !dbg !14 + call void @llvm.dbg.declare(metadata i32 addrspace(2)* addrspace(5)* %FuncVar1, metadata !15, metadata !13), !dbg !16 + store i32 addrspace(2)* null, i32 addrspace(2)* addrspace(5)* %FuncVar1, align 4, !dbg !16 + call void @llvm.dbg.declare(metadata i32 addrspace(3)* addrspace(5)* %FuncVar2, metadata !17, metadata !13), !dbg !19 + store i32 addrspace(3)* addrspacecast (i32* null to i32 addrspace(3)*), i32 addrspace(3)* addrspace(5)* %FuncVar2, align 4, !dbg !19 + call void @llvm.dbg.declare(metadata i32 addrspace(5)* addrspace(5)* %FuncVar3, metadata !20, metadata !13), !dbg !22 + store i32 addrspace(5)* 
addrspacecast (i32* null to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %FuncVar3, align 4, !dbg !22 + call void @llvm.dbg.declare(metadata i32* addrspace(5)* %FuncVar4, metadata !23, metadata !13), !dbg !24 + store i32* null, i32* addrspace(5)* %FuncVar4, align 4, !dbg !24 ret void, !dbg !25 } Index: test/DebugInfo/AMDGPU/variable-locations.ll =================================================================== --- test/DebugInfo/AMDGPU/variable-locations.ll +++ test/DebugInfo/AMDGPU/variable-locations.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs -filetype=obj < %s | llvm-dwarfdump -v -debug-info - | FileCheck %s +target datalayout = "A5" ; LLVM IR generated with the following command and OpenCL source: ; @@ -7,7 +8,7 @@ ; global int GlobA; ; global int GlobB; ; -; kernel void kernel1(unsigned int ArgN, global int *ArgA, global int *ArgB) { +; kernel void kernel1(unsigned int ArgN, global int *ArgA, global int *ArgB) { ; ArgA[ArgN] += ArgB[ArgN]; ; } @@ -45,22 +46,22 @@ ; CHECK-NEXT: DW_AT_name {{.*}}"ArgB" i32 addrspace(1)* %ArgB) !dbg !13 { entry: - %ArgN.addr = alloca i32, align 4 - %ArgA.addr = alloca i32 addrspace(1)*, align 4 - %ArgB.addr = alloca i32 addrspace(1)*, align 4 - store i32 %ArgN, i32* %ArgN.addr, align 4 - call void @llvm.dbg.declare(metadata i32* %ArgN.addr, metadata !22, metadata !23), !dbg !24 - store i32 addrspace(1)* %ArgA, i32 addrspace(1)** %ArgA.addr, align 4 - call void @llvm.dbg.declare(metadata i32 addrspace(1)** %ArgA.addr, metadata !25, metadata !23), !dbg !26 - store i32 addrspace(1)* %ArgB, i32 addrspace(1)** %ArgB.addr, align 4 - call void @llvm.dbg.declare(metadata i32 addrspace(1)** %ArgB.addr, metadata !27, metadata !23), !dbg !28 - %0 = load i32 addrspace(1)*, i32 addrspace(1)** %ArgB.addr, align 4, !dbg !29 - %1 = load i32, i32* %ArgN.addr, align 4, !dbg !30 + %ArgN.addr = alloca i32, align 4, addrspace(5) + %ArgA.addr = alloca i32 addrspace(1)*,
align 4, addrspace(5) + %ArgB.addr = alloca i32 addrspace(1)*, align 4, addrspace(5) + store i32 %ArgN, i32 addrspace(5)* %ArgN.addr, align 4 + call void @llvm.dbg.declare(metadata i32 addrspace(5)* %ArgN.addr, metadata !22, metadata !23), !dbg !24 + store i32 addrspace(1)* %ArgA, i32 addrspace(1)* addrspace(5)* %ArgA.addr, align 4 + call void @llvm.dbg.declare(metadata i32 addrspace(1)* addrspace(5)* %ArgA.addr, metadata !25, metadata !23), !dbg !26 + store i32 addrspace(1)* %ArgB, i32 addrspace(1)* addrspace(5)* %ArgB.addr, align 4 + call void @llvm.dbg.declare(metadata i32 addrspace(1)* addrspace(5)* %ArgB.addr, metadata !27, metadata !23), !dbg !28 + %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %ArgB.addr, align 4, !dbg !29 + %1 = load i32, i32 addrspace(5)* %ArgN.addr, align 4, !dbg !30 %idxprom = zext i32 %1 to i64, !dbg !29 %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 %idxprom, !dbg !29 %2 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !29 - %3 = load i32 addrspace(1)*, i32 addrspace(1)** %ArgA.addr, align 4, !dbg !31 - %4 = load i32, i32* %ArgN.addr, align 4, !dbg !32 + %3 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %ArgA.addr, align 4, !dbg !31 + %4 = load i32, i32 addrspace(5)* %ArgN.addr, align 4, !dbg !32 %idxprom1 = zext i32 %4 to i64, !dbg !31 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %3, i64 %idxprom1, !dbg !31 %5 = load i32, i32 addrspace(1)* %arrayidx2, align 4, !dbg !33 @@ -94,7 +95,7 @@ !17 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 64) !18 = !{i32 0, i32 1, i32 1} !19 = !{!"none", !"none", !"none"} -!20 = !{!"uint", !"int*", !"int*"} +!20 = !{!"uint", !"int addrspace(5)*", !"int addrspace(5)*"} !21 = !{!"", !"", !""} !22 = !DILocalVariable(name: "ArgN", arg: 1, scope: !13, file: !3, line: 4, type: !16) !23 = !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef) Index: test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll 
=================================================================== --- test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll +++ test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll @@ -5,7 +5,7 @@ ; CHECK: br ; CHECK-NOT: addrspacecast define i64 @no_sink_local_to_flat(i1 %pred, i64 addrspace(3)* %ptr) { - %ptr_cast = addrspacecast i64 addrspace(3)* %ptr to i64 addrspace(4)* + %ptr_cast = addrspacecast i64 addrspace(3)* %ptr to i64* br i1 %pred, label %l1, label %l2 l1: @@ -13,7 +13,7 @@ ret i64 %v1 l2: - %v2 = load i64, i64 addrspace(4)* %ptr_cast + %v2 = load i64, i64* %ptr_cast ret i64 %v2 } @@ -21,16 +21,16 @@ ; CHECK: addrspacecast ; CHECK: br ; CHECK-NOT: addrspacecast -define i64 @no_sink_private_to_flat(i1 %pred, i64* %ptr) { - %ptr_cast = addrspacecast i64* %ptr to i64 addrspace(4)* +define i64 @no_sink_private_to_flat(i1 %pred, i64 addrspace(5)* %ptr) { + %ptr_cast = addrspacecast i64 addrspace(5)* %ptr to i64* br i1 %pred, label %l1, label %l2 l1: - %v1 = load i64, i64* %ptr + %v1 = load i64, i64 addrspace(5)* %ptr ret i64 %v1 l2: - %v2 = load i64, i64 addrspace(4)* %ptr_cast + %v2 = load i64, i64* %ptr_cast ret i64 %v2 } @@ -40,7 +40,7 @@ ; CHECK: br ; CHECK: addrspacecast define i64 @sink_global_to_flat(i1 %pred, i64 addrspace(1)* %ptr) { - %ptr_cast = addrspacecast i64 addrspace(1)* %ptr to i64 addrspace(4)* + %ptr_cast = addrspacecast i64 addrspace(1)* %ptr to i64* br i1 %pred, label %l1, label %l2 l1: @@ -48,7 +48,7 @@ ret i64 %v1 l2: - %v2 = load i64, i64 addrspace(4)* %ptr_cast + %v2 = load i64, i64* %ptr_cast ret i64 %v2 } @@ -56,12 +56,12 @@ ; CHECK-NOT: addrspacecast ; CHECK: br ; CHECK: addrspacecast -define i64 @sink_flat_to_global(i1 %pred, i64 addrspace(4)* %ptr) { - %ptr_cast = addrspacecast i64 addrspace(4)* %ptr to i64 addrspace(1)* +define i64 @sink_flat_to_global(i1 %pred, i64* %ptr) { + %ptr_cast = addrspacecast i64* %ptr to i64 addrspace(1)* br i1 %pred, label %l1, label %l2 l1: - %v1 = load i64, i64 addrspace(4)* %ptr 
+ %v1 = load i64, i64* %ptr ret i64 %v1 l2: @@ -73,12 +73,12 @@ ; CHECK-NOT: addrspacecast ; CHECK: br ; CHECK: addrspacecast -define i64 @sink_flat_to_constant(i1 %pred, i64 addrspace(4)* %ptr) { - %ptr_cast = addrspacecast i64 addrspace(4)* %ptr to i64 addrspace(2)* +define i64 @sink_flat_to_constant(i1 %pred, i64* %ptr) { + %ptr_cast = addrspacecast i64* %ptr to i64 addrspace(2)* br i1 %pred, label %l1, label %l2 l1: - %v1 = load i64, i64 addrspace(4)* %ptr + %v1 = load i64, i64* %ptr ret i64 %v1 l2: @@ -90,12 +90,12 @@ ; CHECK-NOT: addrspacecast ; CHECK: br ; CHECK: addrspacecast -define i64 @sink_flat_to_local(i1 %pred, i64 addrspace(4)* %ptr) { - %ptr_cast = addrspacecast i64 addrspace(4)* %ptr to i64 addrspace(3)* +define i64 @sink_flat_to_local(i1 %pred, i64* %ptr) { + %ptr_cast = addrspacecast i64* %ptr to i64 addrspace(3)* br i1 %pred, label %l1, label %l2 l1: - %v1 = load i64, i64 addrspace(4)* %ptr + %v1 = load i64, i64* %ptr ret i64 %v1 l2: @@ -107,15 +107,15 @@ ; CHECK-NOT: addrspacecast ; CHECK: br ; CHECK: addrspacecast -define i64 @sink_flat_to_private(i1 %pred, i64 addrspace(4)* %ptr) { - %ptr_cast = addrspacecast i64 addrspace(4)* %ptr to i64* +define i64 @sink_flat_to_private(i1 %pred, i64* %ptr) { + %ptr_cast = addrspacecast i64* %ptr to i64 addrspace(5)* br i1 %pred, label %l1, label %l2 l1: - %v1 = load i64, i64 addrspace(4)* %ptr + %v1 = load i64, i64* %ptr ret i64 %v1 l2: - %v2 = load i64, i64* %ptr_cast + %v2 = load i64, i64 addrspace(5)* %ptr_cast ret i64 %v2 } Index: test/Transforms/InferAddressSpaces/AMDGPU/basic.ll =================================================================== --- test/Transforms/InferAddressSpaces/AMDGPU/basic.ll +++ test/Transforms/InferAddressSpaces/AMDGPU/basic.ll @@ -3,69 +3,69 @@ ; Trivial optimization of generic addressing ; CHECK-LABEL: @load_global_from_flat( -; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)* +; CHECK-NEXT: %tmp0 = addrspacecast float* 
%generic_scalar to float addrspace(1)* ; CHECK-NEXT: %tmp1 = load float, float addrspace(1)* %tmp0 ; CHECK-NEXT: ret float %tmp1 -define float @load_global_from_flat(float addrspace(4)* %generic_scalar) #0 { - %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)* +define float @load_global_from_flat(float* %generic_scalar) #0 { + %tmp0 = addrspacecast float* %generic_scalar to float addrspace(1)* %tmp1 = load float, float addrspace(1)* %tmp0 ret float %tmp1 } ; CHECK-LABEL: @load_constant_from_flat( -; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(2)* +; CHECK-NEXT: %tmp0 = addrspacecast float* %generic_scalar to float addrspace(2)* ; CHECK-NEXT: %tmp1 = load float, float addrspace(2)* %tmp0 ; CHECK-NEXT: ret float %tmp1 -define float @load_constant_from_flat(float addrspace(4)* %generic_scalar) #0 { - %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(2)* +define float @load_constant_from_flat(float* %generic_scalar) #0 { + %tmp0 = addrspacecast float* %generic_scalar to float addrspace(2)* %tmp1 = load float, float addrspace(2)* %tmp0 ret float %tmp1 } ; CHECK-LABEL: @load_group_from_flat( -; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)* +; CHECK-NEXT: %tmp0 = addrspacecast float* %generic_scalar to float addrspace(3)* ; CHECK-NEXT: %tmp1 = load float, float addrspace(3)* %tmp0 ; CHECK-NEXT: ret float %tmp1 -define float @load_group_from_flat(float addrspace(4)* %generic_scalar) #0 { - %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)* +define float @load_group_from_flat(float* %generic_scalar) #0 { + %tmp0 = addrspacecast float* %generic_scalar to float addrspace(3)* %tmp1 = load float, float addrspace(3)* %tmp0 ret float %tmp1 } ; CHECK-LABEL: @load_private_from_flat( -; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float* -; CHECK-NEXT: %tmp1 = load float, float* 
%tmp0 +; CHECK-NEXT: %tmp0 = addrspacecast float* %generic_scalar to float addrspace(5)* +; CHECK-NEXT: %tmp1 = load float, float addrspace(5)* %tmp0 ; CHECK-NEXT: ret float %tmp1 -define float @load_private_from_flat(float addrspace(4)* %generic_scalar) #0 { - %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float* - %tmp1 = load float, float* %tmp0 +define float @load_private_from_flat(float* %generic_scalar) #0 { + %tmp0 = addrspacecast float* %generic_scalar to float addrspace(5)* + %tmp1 = load float, float addrspace(5)* %tmp0 ret float %tmp1 } ; CHECK-LABEL: @store_global_from_flat( -; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)* +; CHECK-NEXT: %tmp0 = addrspacecast float* %generic_scalar to float addrspace(1)* ; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* %tmp0 -define amdgpu_kernel void @store_global_from_flat(float addrspace(4)* %generic_scalar) #0 { - %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)* +define amdgpu_kernel void @store_global_from_flat(float* %generic_scalar) #0 { + %tmp0 = addrspacecast float* %generic_scalar to float addrspace(1)* store float 0.0, float addrspace(1)* %tmp0 ret void } ; CHECK-LABEL: @store_group_from_flat( -; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)* +; CHECK-NEXT: %tmp0 = addrspacecast float* %generic_scalar to float addrspace(3)* ; CHECK-NEXT: store float 0.000000e+00, float addrspace(3)* %tmp0 -define amdgpu_kernel void @store_group_from_flat(float addrspace(4)* %generic_scalar) #0 { - %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)* +define amdgpu_kernel void @store_group_from_flat(float* %generic_scalar) #0 { + %tmp0 = addrspacecast float* %generic_scalar to float addrspace(3)* store float 0.0, float addrspace(3)* %tmp0 ret void } ; CHECK-LABEL: @store_private_from_flat( -; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* 
%generic_scalar to float* -; CHECK-NEXT: store float 0.000000e+00, float* %tmp0 -define amdgpu_kernel void @store_private_from_flat(float addrspace(4)* %generic_scalar) #0 { - %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float* - store float 0.0, float* %tmp0 +; CHECK-NEXT: %tmp0 = addrspacecast float* %generic_scalar to float addrspace(5)* +; CHECK-NEXT: store float 0.000000e+00, float addrspace(5)* %tmp0 +define amdgpu_kernel void @store_private_from_flat(float* %generic_scalar) #0 { + %tmp0 = addrspacecast float* %generic_scalar to float addrspace(5)* + store float 0.0, float addrspace(5)* %tmp0 ret void } @@ -75,10 +75,10 @@ ; CHECK-NEXT: store i32 %val, i32 addrspace(1)* %output, align 4 ; CHECK-NEXT: ret void define amdgpu_kernel void @load_store_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { - %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)* - %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)* - %val = load i32, i32 addrspace(4)* %tmp0, align 4 - store i32 %val, i32 addrspace(4)* %tmp1, align 4 + %tmp0 = addrspacecast i32 addrspace(1)* %input to i32* + %tmp1 = addrspacecast i32 addrspace(1)* %output to i32* + %val = load i32, i32* %tmp0, align 4 + store i32 %val, i32* %tmp1, align 4 ret void } @@ -88,95 +88,95 @@ ; CHECK-NEXT: store i32 %val, i32 addrspace(3)* %output, align 4 ; CHECK-NEXT: ret void define amdgpu_kernel void @load_store_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 { - %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)* - %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)* - %val = load i32, i32 addrspace(4)* %tmp0, align 4 - store i32 %val, i32 addrspace(4)* %tmp1, align 4 + %tmp0 = addrspacecast i32 addrspace(3)* %input to i32* + %tmp1 = addrspacecast i32 addrspace(3)* %output to i32* + %val = load i32, i32* %tmp0, align 4 + store i32 %val, i32* %tmp1, align 4 ret void } ; Optimized 
to private load/store. ; CHECK-LABEL: @load_store_private( -; CHECK-NEXT: %val = load i32, i32* %input, align 4 -; CHECK-NEXT: store i32 %val, i32* %output, align 4 +; CHECK-NEXT: %val = load i32, i32 addrspace(5)* %input, align 4 +; CHECK-NEXT: store i32 %val, i32 addrspace(5)* %output, align 4 ; CHECK-NEXT: ret void -define amdgpu_kernel void @load_store_private(i32* nocapture %input, i32* nocapture %output) #0 { - %tmp0 = addrspacecast i32* %input to i32 addrspace(4)* - %tmp1 = addrspacecast i32* %output to i32 addrspace(4)* - %val = load i32, i32 addrspace(4)* %tmp0, align 4 - store i32 %val, i32 addrspace(4)* %tmp1, align 4 +define amdgpu_kernel void @load_store_private(i32 addrspace(5)* nocapture %input, i32 addrspace(5)* nocapture %output) #0 { + %tmp0 = addrspacecast i32 addrspace(5)* %input to i32* + %tmp1 = addrspacecast i32 addrspace(5)* %output to i32* + %val = load i32, i32* %tmp0, align 4 + store i32 %val, i32* %tmp1, align 4 ret void } ; No optimization. flat load/store. ; CHECK-LABEL: @load_store_flat( -; CHECK-NEXT: %val = load i32, i32 addrspace(4)* %input, align 4 -; CHECK-NEXT: store i32 %val, i32 addrspace(4)* %output, align 4 +; CHECK-NEXT: %val = load i32, i32* %input, align 4 +; CHECK-NEXT: store i32 %val, i32* %output, align 4 ; CHECK-NEXT: ret void -define amdgpu_kernel void @load_store_flat(i32 addrspace(4)* nocapture %input, i32 addrspace(4)* nocapture %output) #0 { - %val = load i32, i32 addrspace(4)* %input, align 4 - store i32 %val, i32 addrspace(4)* %output, align 4 +define amdgpu_kernel void @load_store_flat(i32* nocapture %input, i32* nocapture %output) #0 { + %val = load i32, i32* %input, align 4 + store i32 %val, i32* %output, align 4 ret void } ; CHECK-LABEL: @store_addrspacecast_ptr_value( -; CHECK: %cast = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)* -; CHECK-NEXT: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %output, align 4 -define amdgpu_kernel void @store_addrspacecast_ptr_value(i32 
addrspace(1)* nocapture %input, i32 addrspace(4)* addrspace(1)* nocapture %output) #0 { - %cast = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)* - store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %output, align 4 +; CHECK: %cast = addrspacecast i32 addrspace(1)* %input to i32* +; CHECK-NEXT: store i32* %cast, i32* addrspace(1)* %output, align 4 +define amdgpu_kernel void @store_addrspacecast_ptr_value(i32 addrspace(1)* nocapture %input, i32* addrspace(1)* nocapture %output) #0 { + %cast = addrspacecast i32 addrspace(1)* %input to i32* + store i32* %cast, i32* addrspace(1)* %output, align 4 ret void } ; CHECK-LABEL: @atomicrmw_add_global_to_flat( ; CHECK-NEXT: %ret = atomicrmw add i32 addrspace(1)* %global.ptr, i32 %y seq_cst define i32 @atomicrmw_add_global_to_flat(i32 addrspace(1)* %global.ptr, i32 %y) #0 { - %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)* - %ret = atomicrmw add i32 addrspace(4)* %cast, i32 %y seq_cst + %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32* + %ret = atomicrmw add i32* %cast, i32 %y seq_cst ret i32 %ret } ; CHECK-LABEL: @atomicrmw_add_group_to_flat( ; CHECK-NEXT: %ret = atomicrmw add i32 addrspace(3)* %group.ptr, i32 %y seq_cst define i32 @atomicrmw_add_group_to_flat(i32 addrspace(3)* %group.ptr, i32 %y) #0 { - %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)* - %ret = atomicrmw add i32 addrspace(4)* %cast, i32 %y seq_cst + %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32* + %ret = atomicrmw add i32* %cast, i32 %y seq_cst ret i32 %ret } ; CHECK-LABEL: @cmpxchg_global_to_flat( ; CHECK: %ret = cmpxchg i32 addrspace(1)* %global.ptr, i32 %cmp, i32 %val seq_cst monotonic define { i32, i1 } @cmpxchg_global_to_flat(i32 addrspace(1)* %global.ptr, i32 %cmp, i32 %val) #0 { - %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)* - %ret = cmpxchg i32 addrspace(4)* %cast, i32 %cmp, i32 %val seq_cst monotonic + %cast = addrspacecast i32 
addrspace(1)* %global.ptr to i32* + %ret = cmpxchg i32* %cast, i32 %cmp, i32 %val seq_cst monotonic ret { i32, i1 } %ret } ; CHECK-LABEL: @cmpxchg_group_to_flat( ; CHECK: %ret = cmpxchg i32 addrspace(3)* %group.ptr, i32 %cmp, i32 %val seq_cst monotonic define { i32, i1 } @cmpxchg_group_to_flat(i32 addrspace(3)* %group.ptr, i32 %cmp, i32 %val) #0 { - %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)* - %ret = cmpxchg i32 addrspace(4)* %cast, i32 %cmp, i32 %val seq_cst monotonic + %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32* + %ret = cmpxchg i32* %cast, i32 %cmp, i32 %val seq_cst monotonic ret { i32, i1 } %ret } ; Not pointer operand ; CHECK-LABEL: @cmpxchg_group_to_flat_wrong_operand( -; CHECK: %cast.cmp = addrspacecast i32 addrspace(3)* %cmp.ptr to i32 addrspace(4)* -; CHECK: %ret = cmpxchg i32 addrspace(4)* addrspace(3)* %cas.ptr, i32 addrspace(4)* %cast.cmp, i32 addrspace(4)* %val seq_cst monotonic -define { i32 addrspace(4)*, i1 } @cmpxchg_group_to_flat_wrong_operand(i32 addrspace(4)* addrspace(3)* %cas.ptr, i32 addrspace(3)* %cmp.ptr, i32 addrspace(4)* %val) #0 { - %cast.cmp = addrspacecast i32 addrspace(3)* %cmp.ptr to i32 addrspace(4)* - %ret = cmpxchg i32 addrspace(4)* addrspace(3)* %cas.ptr, i32 addrspace(4)* %cast.cmp, i32 addrspace(4)* %val seq_cst monotonic - ret { i32 addrspace(4)*, i1 } %ret +; CHECK: %cast.cmp = addrspacecast i32 addrspace(3)* %cmp.ptr to i32* +; CHECK: %ret = cmpxchg i32* addrspace(3)* %cas.ptr, i32* %cast.cmp, i32* %val seq_cst monotonic +define { i32*, i1 } @cmpxchg_group_to_flat_wrong_operand(i32* addrspace(3)* %cas.ptr, i32 addrspace(3)* %cmp.ptr, i32* %val) #0 { + %cast.cmp = addrspacecast i32 addrspace(3)* %cmp.ptr to i32* + %ret = cmpxchg i32* addrspace(3)* %cas.ptr, i32* %cast.cmp, i32* %val seq_cst monotonic + ret { i32*, i1 } %ret } ; Null pointer in local addr space ; CHECK-LABEL: @local_nullptr -; CHECK: icmp ne i8 addrspace(3)* %a, addrspacecast (i8* null to i8 addrspace(3)*) +; CHECK: 
icmp ne i8 addrspace(3)* %a, addrspacecast (i8 addrspace(5)* null to i8 addrspace(3)*) ; CHECK-NOT: i8 addrspace(3)* null define void @local_nullptr(i32 addrspace(1)* nocapture %results, i8 addrspace(3)* %a) { entry: - %tobool = icmp ne i8 addrspace(3)* %a, addrspacecast (i8* null to i8 addrspace(3)*) + %tobool = icmp ne i8 addrspace(3)* %a, addrspacecast (i8 addrspace(5)* null to i8 addrspace(3)*) %conv = zext i1 %tobool to i32 store i32 %conv, i32 addrspace(1)* %results, align 4 ret void Index: test/Transforms/InferAddressSpaces/AMDGPU/icmp.ll =================================================================== --- test/Transforms/InferAddressSpaces/AMDGPU/icmp.ll +++ test/Transforms/InferAddressSpaces/AMDGPU/icmp.ll @@ -3,57 +3,57 @@ ; CHECK-LABEL: @icmp_flat_cmp_self( ; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, %group.ptr.0 define i1 @icmp_flat_cmp_self(i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %cmp = icmp eq i32 addrspace(4)* %cast0, %cast0 + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %cmp = icmp eq i32* %cast0, %cast0 ret i1 %cmp } ; CHECK-LABEL: @icmp_flat_flat_from_group( ; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, %group.ptr.1 define i1 @icmp_flat_flat_from_group(i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)* - %cmp = icmp eq i32 addrspace(4)* %cast0, %cast1 + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32* + %cmp = icmp eq i32* %cast0, %cast1 ret i1 %cmp } ; CHECK-LABEL: @icmp_mismatch_flat_from_group_private( -; CHECK: %1 = addrspacecast i32* %private.ptr.0 to i32 addrspace(4)* -; CHECK: %2 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)* -; CHECK: %cmp = icmp eq i32 
addrspace(4)* %1, %2 -define i1 @icmp_mismatch_flat_from_group_private(i32* %private.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 { - %cast0 = addrspacecast i32* %private.ptr.0 to i32 addrspace(4)* - %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)* - %cmp = icmp eq i32 addrspace(4)* %cast0, %cast1 +; CHECK: %1 = addrspacecast i32 addrspace(5)* %private.ptr.0 to i32* +; CHECK: %2 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32* +; CHECK: %cmp = icmp eq i32* %1, %2 +define i1 @icmp_mismatch_flat_from_group_private(i32 addrspace(5)* %private.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 { + %cast0 = addrspacecast i32 addrspace(5)* %private.ptr.0 to i32* + %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32* + %cmp = icmp eq i32* %cast0, %cast1 ret i1 %cmp } ; CHECK-LABEL: @icmp_flat_group_flat( -; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* -; CHECK: %cmp = icmp eq i32 addrspace(4)* %1, %flat.ptr.1 -define i1 @icmp_flat_group_flat(i32 addrspace(3)* %group.ptr.0, i32 addrspace(4)* %flat.ptr.1) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %cmp = icmp eq i32 addrspace(4)* %cast0, %flat.ptr.1 +; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* +; CHECK: %cmp = icmp eq i32* %1, %flat.ptr.1 +define i1 @icmp_flat_group_flat(i32 addrspace(3)* %group.ptr.0, i32* %flat.ptr.1) #0 { + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %cmp = icmp eq i32* %cast0, %flat.ptr.1 ret i1 %cmp } ; CHECK-LABEL: @icmp_flat_flat_group( -; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)* -; CHECK: %cmp = icmp eq i32 addrspace(4)* %flat.ptr.0, %1 -define i1 @icmp_flat_flat_group(i32 addrspace(4)* %flat.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 { - %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)* - %cmp = icmp eq i32 addrspace(4)* %flat.ptr.0, %cast1 +; CHECK: %1 = addrspacecast i32 addrspace(3)* 
%group.ptr.1 to i32* +; CHECK: %cmp = icmp eq i32* %flat.ptr.0, %1 +define i1 @icmp_flat_flat_group(i32* %flat.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 { + %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32* + %cmp = icmp eq i32* %flat.ptr.0, %cast1 ret i1 %cmp } ; Keeping as cmp addrspace(3)* is better ; CHECK-LABEL: @icmp_flat_to_group_cmp( -; CHECK: %cast0 = addrspacecast i32 addrspace(4)* %flat.ptr.0 to i32 addrspace(3)* -; CHECK: %cast1 = addrspacecast i32 addrspace(4)* %flat.ptr.1 to i32 addrspace(3)* +; CHECK: %cast0 = addrspacecast i32* %flat.ptr.0 to i32 addrspace(3)* +; CHECK: %cast1 = addrspacecast i32* %flat.ptr.1 to i32 addrspace(3)* ; CHECK: %cmp = icmp eq i32 addrspace(3)* %cast0, %cast1 -define i1 @icmp_flat_to_group_cmp(i32 addrspace(4)* %flat.ptr.0, i32 addrspace(4)* %flat.ptr.1) #0 { - %cast0 = addrspacecast i32 addrspace(4)* %flat.ptr.0 to i32 addrspace(3)* - %cast1 = addrspacecast i32 addrspace(4)* %flat.ptr.1 to i32 addrspace(3)* +define i1 @icmp_flat_to_group_cmp(i32* %flat.ptr.0, i32* %flat.ptr.1) #0 { + %cast0 = addrspacecast i32* %flat.ptr.0 to i32 addrspace(3)* + %cast1 = addrspacecast i32* %flat.ptr.1 to i32 addrspace(3)* %cmp = icmp eq i32 addrspace(3)* %cast0, %cast1 ret i1 %cmp } @@ -62,35 +62,35 @@ ; constant cast if this is OK to change if 0 is a valid pointer. 
; CHECK-LABEL: @icmp_group_flat_cmp_null( -; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*) +; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, addrspacecast (i32* null to i32 addrspace(3)*) define i1 @icmp_group_flat_cmp_null(i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %cmp = icmp eq i32 addrspace(4)* %cast0, null + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %cmp = icmp eq i32* %cast0, null ret i1 %cmp } ; CHECK-LABEL: @icmp_group_flat_cmp_constant_inttoptr( -; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, addrspacecast (i32 addrspace(4)* inttoptr (i64 400 to i32 addrspace(4)*) to i32 addrspace(3)*) +; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, addrspacecast (i32* inttoptr (i64 400 to i32*) to i32 addrspace(3)*) define i1 @icmp_group_flat_cmp_constant_inttoptr(i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %cmp = icmp eq i32 addrspace(4)* %cast0, inttoptr (i64 400 to i32 addrspace(4)*) + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %cmp = icmp eq i32* %cast0, inttoptr (i64 400 to i32*) ret i1 %cmp } ; CHECK-LABEL: @icmp_mismatch_flat_group_private_cmp_null( -; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* -; CHECK: %cmp = icmp eq i32 addrspace(4)* %1, addrspacecast (i32* null to i32 addrspace(4)*) +; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* +; CHECK: %cmp = icmp eq i32* %1, addrspacecast (i32 addrspace(5)* null to i32*) define i1 @icmp_mismatch_flat_group_private_cmp_null(i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %cmp = icmp eq i32 addrspace(4)* %cast0, addrspacecast (i32* null to i32 addrspace(4)*) + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %cmp 
= icmp eq i32* %cast0, addrspacecast (i32 addrspace(5)* null to i32*) ret i1 %cmp } ; CHECK-LABEL: @icmp_mismatch_flat_group_private_cmp_undef( ; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, undef define i1 @icmp_mismatch_flat_group_private_cmp_undef(i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %cmp = icmp eq i32 addrspace(4)* %cast0, addrspacecast (i32* undef to i32 addrspace(4)*) + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %cmp = icmp eq i32* %cast0, addrspacecast (i32 addrspace(5)* undef to i32*) ret i1 %cmp } @@ -98,62 +98,62 @@ @global0 = internal addrspace(1) global i32 0, align 4 ; CHECK-LABEL: @icmp_mismatch_flat_group_global_cmp_gv( -; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* -; CHECK: %cmp = icmp eq i32 addrspace(4)* %1, addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*) +; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* +; CHECK: %cmp = icmp eq i32* %1, addrspacecast (i32 addrspace(1)* @global0 to i32*) define i1 @icmp_mismatch_flat_group_global_cmp_gv(i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %cmp = icmp eq i32 addrspace(4)* %cast0, addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*) + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %cmp = icmp eq i32* %cast0, addrspacecast (i32 addrspace(1)* @global0 to i32*) ret i1 %cmp } ; CHECK-LABEL: @icmp_mismatch_group_global_cmp_gv_gv( -; CHECK: %cmp = icmp eq i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*) +; CHECK: %cmp = icmp eq i32* addrspacecast (i32 addrspace(3)* @lds0 to i32*), addrspacecast (i32 addrspace(1)* @global0 to i32*) define i1 @icmp_mismatch_group_global_cmp_gv_gv(i32 addrspace(3)* %group.ptr.0) #0 { - %cmp = icmp eq i32 addrspace(4)* 
addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*) + %cmp = icmp eq i32* addrspacecast (i32 addrspace(3)* @lds0 to i32*), addrspacecast (i32 addrspace(1)* @global0 to i32*) ret i1 %cmp } ; CHECK-LABEL: @icmp_group_flat_cmp_undef( ; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, undef define i1 @icmp_group_flat_cmp_undef(i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %cmp = icmp eq i32 addrspace(4)* %cast0, undef + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %cmp = icmp eq i32* %cast0, undef ret i1 %cmp } ; Test non-canonical orders ; CHECK-LABEL: @icmp_mismatch_flat_group_private_cmp_null_swap( -; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* -; CHECK: %cmp = icmp eq i32 addrspace(4)* addrspacecast (i32* null to i32 addrspace(4)*), %1 +; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* +; CHECK: %cmp = icmp eq i32* addrspacecast (i32 addrspace(5)* null to i32*), %1 define i1 @icmp_mismatch_flat_group_private_cmp_null_swap(i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %cmp = icmp eq i32 addrspace(4)* addrspacecast (i32* null to i32 addrspace(4)*), %cast0 + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %cmp = icmp eq i32* addrspacecast (i32 addrspace(5)* null to i32*), %cast0 ret i1 %cmp } ; CHECK-LABEL: @icmp_group_flat_cmp_undef_swap( ; CHECK: %cmp = icmp eq i32 addrspace(3)* undef, %group.ptr.0 define i1 @icmp_group_flat_cmp_undef_swap(i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %cmp = icmp eq i32 addrspace(4)* undef, %cast0 + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %cmp = icmp eq i32* undef, %cast0 ret i1 %cmp } ; CHECK-LABEL: 
@icmp_mismatch_flat_group_private_cmp_undef_swap( ; CHECK: %cmp = icmp eq i32 addrspace(3)* undef, %group.ptr.0 define i1 @icmp_mismatch_flat_group_private_cmp_undef_swap(i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %cmp = icmp eq i32 addrspace(4)* addrspacecast (i32* undef to i32 addrspace(4)*), %cast0 + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %cmp = icmp eq i32* addrspacecast (i32 addrspace(5)* undef to i32*), %cast0 ret i1 %cmp } ; TODO: Should be handled ; CHECK-LABEL: @icmp_flat_flat_from_group_vector( -; CHECK: %cmp = icmp eq <2 x i32 addrspace(4)*> %cast0, %cast1 +; CHECK: %cmp = icmp eq <2 x i32*> %cast0, %cast1 define <2 x i1> @icmp_flat_flat_from_group_vector(<2 x i32 addrspace(3)*> %group.ptr.0, <2 x i32 addrspace(3)*> %group.ptr.1) #0 { - %cast0 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.0 to <2 x i32 addrspace(4)*> - %cast1 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.1 to <2 x i32 addrspace(4)*> - %cmp = icmp eq <2 x i32 addrspace(4)*> %cast0, %cast1 + %cast0 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.0 to <2 x i32*> + %cast1 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.1 to <2 x i32*> + %cmp = icmp eq <2 x i32*> %cast0, %cast1 ret <2 x i1> %cmp } Index: test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll =================================================================== --- test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll +++ test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll @@ -30,29 +30,29 @@ ; CHECK: ret void define amdgpu_kernel void @load_store_lds_f32(i32 %i, float %v) #0 { bb: - %tmp = load float, float addrspace(4)* addrspacecast (float addrspace(3)* @scalar to float addrspace(4)*), align 4 + %tmp = load float, float* addrspacecast (float addrspace(3)* @scalar to float*), align 4 call void @use(float %tmp) - store float %v, float addrspace(4)* addrspacecast (float addrspace(3)* 
@scalar to float addrspace(4)*), align 4 + store float %v, float* addrspacecast (float addrspace(3)* @scalar to float*), align 4 call void @llvm.amdgcn.s.barrier() - %tmp1 = addrspacecast float addrspace(3)* @scalar to float addrspace(4)* - %tmp2 = load float, float addrspace(4)* %tmp1, align 4 + %tmp1 = addrspacecast float addrspace(3)* @scalar to float* + %tmp2 = load float, float* %tmp1, align 4 call void @use(float %tmp2) - store float %v, float addrspace(4)* %tmp1, align 4 + store float %v, float* %tmp1, align 4 call void @llvm.amdgcn.s.barrier() - %tmp3 = load float, float addrspace(4)* getelementptr inbounds ([10 x float], [10 x float] addrspace(4)* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float] addrspace(4)*), i32 0, i32 5), align 4 + %tmp3 = load float, float* getelementptr inbounds ([10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4 call void @use(float %tmp3) - store float %v, float addrspace(4)* getelementptr inbounds ([10 x float], [10 x float] addrspace(4)* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float] addrspace(4)*), i32 0, i32 5), align 4 + store float %v, float* getelementptr inbounds ([10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4 call void @llvm.amdgcn.s.barrier() - %tmp4 = getelementptr inbounds [10 x float], [10 x float] addrspace(4)* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float] addrspace(4)*), i32 0, i32 5 - %tmp5 = load float, float addrspace(4)* %tmp4, align 4 + %tmp4 = getelementptr inbounds [10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5 + %tmp5 = load float, float* %tmp4, align 4 call void @use(float %tmp5) - store float %v, float addrspace(4)* %tmp4, align 4 + store float %v, float* %tmp4, align 4 call void @llvm.amdgcn.s.barrier() - %tmp6 = addrspacecast [10 x float] addrspace(3)* 
@array to [10 x float] addrspace(4)* - %tmp7 = getelementptr inbounds [10 x float], [10 x float] addrspace(4)* %tmp6, i32 0, i32 %i - %tmp8 = load float, float addrspace(4)* %tmp7, align 4 + %tmp6 = addrspacecast [10 x float] addrspace(3)* @array to [10 x float]* + %tmp7 = getelementptr inbounds [10 x float], [10 x float]* %tmp6, i32 0, i32 %i + %tmp8 = load float, float* %tmp7, align 4 call void @use(float %tmp8) - store float %v, float addrspace(4)* %tmp7, align 4 + store float %v, float* %tmp7, align 4 call void @llvm.amdgcn.s.barrier() ret void } @@ -61,7 +61,7 @@ ; CHECK: %tmp = load i32, i32 addrspace(3)* bitcast (float addrspace(3)* @scalar to i32 addrspace(3)*), align 4 define i32 @constexpr_load_int_from_float_lds() #0 { bb: - %tmp = load i32, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @scalar to i32 addrspace(3)*) to i32 addrspace(4)*), align 4 + %tmp = load i32, i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @scalar to i32 addrspace(3)*) to i32*), align 4 ret i32 %tmp } @@ -73,18 +73,18 @@ ; CHECK: ret i32 %tmp4 define i32 @load_int_from_global_float(float addrspace(1)* %input, i32 %i, i32 %j) #0 { bb: - %tmp = addrspacecast float addrspace(1)* %input to float addrspace(4)* - %tmp1 = getelementptr float, float addrspace(4)* %tmp, i32 %i - %tmp2 = getelementptr float, float addrspace(4)* %tmp1, i32 %j - %tmp3 = bitcast float addrspace(4)* %tmp2 to i32 addrspace(4)* - %tmp4 = load i32, i32 addrspace(4)* %tmp3 + %tmp = addrspacecast float addrspace(1)* %input to float* + %tmp1 = getelementptr float, float* %tmp, i32 %i + %tmp2 = getelementptr float, float* %tmp1, i32 %j + %tmp3 = bitcast float* %tmp2 to i32* + %tmp4 = load i32, i32* %tmp3 ret i32 %tmp4 } ; CHECK-LABEL: @nested_const_expr( ; CHECK: store i32 1, i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds ([10 x float], [10 x float] addrspace(3)* @array, i64 0, i64 1) to i32 addrspace(3)*), align 4 define amdgpu_kernel void 
@nested_const_expr() #0 { - store i32 1, i32 addrspace(4)* bitcast (float addrspace(4)* getelementptr ([10 x float], [10 x float] addrspace(4)* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float] addrspace(4)*), i64 0, i64 1) to i32 addrspace(4)*), align 4 + store i32 1, i32* bitcast (float* getelementptr ([10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i64 0, i64 1) to i32*), align 4 ret void } @@ -95,10 +95,10 @@ ; CHECK-NEXT: ret void define amdgpu_kernel void @rauw(float addrspace(1)* %input) #0 { bb: - %generic_input = addrspacecast float addrspace(1)* %input to float addrspace(4)* - %addr = getelementptr float, float addrspace(4)* %generic_input, i64 10 - %v = load float, float addrspace(4)* %addr - store float %v, float addrspace(4)* %addr + %generic_input = addrspacecast float addrspace(1)* %input to float* + %addr = getelementptr float, float* %generic_input, i64 10 + %v = load float, float* %addr + store float %v, float* %addr ret void } @@ -119,27 +119,27 @@ ; CHECK: br i1 %exit_cond, label %exit, label %loop define amdgpu_kernel void @loop() #0 { entry: - %p = addrspacecast [10 x float] addrspace(3)* @array to float addrspace(4)* - %end = getelementptr float, float addrspace(4)* %p, i64 10 + %p = addrspacecast [10 x float] addrspace(3)* @array to float* + %end = getelementptr float, float* %p, i64 10 br label %loop loop: ; preds = %loop, %entry - %i = phi float addrspace(4)* [ %p, %entry ], [ %i2, %loop ] - %v = load float, float addrspace(4)* %i + %i = phi float* [ %p, %entry ], [ %i2, %loop ] + %v = load float, float* %i call void @use(float %v) - %i2 = getelementptr float, float addrspace(4)* %i, i64 1 - %exit_cond = icmp eq float addrspace(4)* %i2, %end + %i2 = getelementptr float, float* %i, i64 1 + %exit_cond = icmp eq float* %i2, %end br i1 %exit_cond, label %exit, label %loop exit: ; preds = %loop ret void } -@generic_end = external addrspace(1) global float addrspace(4)* +@generic_end 
= external addrspace(1) global float* ; CHECK-LABEL: @loop_with_generic_bound( ; CHECK: %p = bitcast [10 x float] addrspace(3)* @array to float addrspace(3)* -; CHECK: %end = load float addrspace(4)*, float addrspace(4)* addrspace(1)* @generic_end +; CHECK: %end = load float*, float* addrspace(1)* @generic_end ; CHECK: br label %loop ; CHECK: loop: @@ -147,21 +147,21 @@ ; CHECK: %v = load float, float addrspace(3)* %i ; CHECK: call void @use(float %v) ; CHECK: %i2 = getelementptr float, float addrspace(3)* %i, i64 1 -; CHECK: %0 = addrspacecast float addrspace(3)* %i2 to float addrspace(4)* -; CHECK: %exit_cond = icmp eq float addrspace(4)* %0, %end +; CHECK: %0 = addrspacecast float addrspace(3)* %i2 to float* +; CHECK: %exit_cond = icmp eq float* %0, %end ; CHECK: br i1 %exit_cond, label %exit, label %loop define amdgpu_kernel void @loop_with_generic_bound() #0 { entry: - %p = addrspacecast [10 x float] addrspace(3)* @array to float addrspace(4)* - %end = load float addrspace(4)*, float addrspace(4)* addrspace(1)* @generic_end + %p = addrspacecast [10 x float] addrspace(3)* @array to float* + %end = load float*, float* addrspace(1)* @generic_end br label %loop loop: ; preds = %loop, %entry - %i = phi float addrspace(4)* [ %p, %entry ], [ %i2, %loop ] - %v = load float, float addrspace(4)* %i + %i = phi float* [ %p, %entry ], [ %i2, %loop ] + %v = load float, float* %i call void @use(float %v) - %i2 = getelementptr float, float addrspace(4)* %i, i64 1 - %exit_cond = icmp eq float addrspace(4)* %i2, %end + %i2 = getelementptr float, float* %i, i64 1 + %exit_cond = icmp eq float* %i2, %end br i1 %exit_cond, label %exit, label %loop exit: ; preds = %loop Index: test/Transforms/InferAddressSpaces/AMDGPU/infer-addrspacecast.ll =================================================================== --- test/Transforms/InferAddressSpaces/AMDGPU/infer-addrspacecast.ll +++ test/Transforms/InferAddressSpaces/AMDGPU/infer-addrspacecast.ll @@ -8,9 +8,9 @@ ; CHECK-NEXT: store i32 
8, i32 addrspace(3)* %gep0, align 8 ; CHECK-NEXT: ret void define void @addrspacecast_gep_addrspacecast(i32 addrspace(3)* %ptr) { - %asc0 = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* - %gep0 = getelementptr i32, i32 addrspace(4)* %asc0, i64 9 - %asc1 = addrspacecast i32 addrspace(4)* %gep0 to i32 addrspace(3)* + %asc0 = addrspacecast i32 addrspace(3)* %ptr to i32* + %gep0 = getelementptr i32, i32* %asc0, i64 9 + %asc1 = addrspacecast i32* %gep0 to i32 addrspace(3)* store i32 8, i32 addrspace(3)* %asc1, align 8 ret void } @@ -21,9 +21,9 @@ ; CHECK-NEXT: store i8 8, i8 addrspace(3)* [[CAST]], align 8 ; CHECK-NEXT: ret void define void @addrspacecast_different_pointee_type(i32 addrspace(3)* %ptr) { - %asc0 = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* - %gep0 = getelementptr i32, i32 addrspace(4)* %asc0, i64 9 - %asc1 = addrspacecast i32 addrspace(4)* %gep0 to i8 addrspace(3)* + %asc0 = addrspacecast i32 addrspace(3)* %ptr to i32* + %gep0 = getelementptr i32, i32* %asc0, i64 9 + %asc1 = addrspacecast i32* %gep0 to i8 addrspace(3)* store i8 8, i8 addrspace(3)* %asc1, align 8 ret void } @@ -33,24 +33,24 @@ ; CHECK-NEXT: store volatile i32 addrspace(3)* %gep0, i32 addrspace(3)* addrspace(1)* undef ; CHECK-NEXT: ret void define void @addrspacecast_to_memory(i32 addrspace(3)* %ptr) { - %asc0 = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* - %gep0 = getelementptr i32, i32 addrspace(4)* %asc0, i64 9 - %asc1 = addrspacecast i32 addrspace(4)* %gep0 to i32 addrspace(3)* + %asc0 = addrspacecast i32 addrspace(3)* %ptr to i32* + %gep0 = getelementptr i32, i32* %asc0, i64 9 + %asc1 = addrspacecast i32* %gep0 to i32 addrspace(3)* store volatile i32 addrspace(3)* %asc1, i32 addrspace(3)* addrspace(1)* undef ret void } ; CHECK-LABEL: @multiuse_addrspacecast_gep_addrspacecast( -; CHECK: %1 = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* -; CHECK-NEXT: store volatile i32 addrspace(4)* %1, i32 addrspace(4)* addrspace(1)* undef +; 
CHECK: %1 = addrspacecast i32 addrspace(3)* %ptr to i32* +; CHECK-NEXT: store volatile i32* %1, i32* addrspace(1)* undef ; CHECK-NEXT: %gep0 = getelementptr i32, i32 addrspace(3)* %ptr, i64 9 ; CHECK-NEXT: store i32 8, i32 addrspace(3)* %gep0, align 8 ; CHECK-NEXT: ret void define void @multiuse_addrspacecast_gep_addrspacecast(i32 addrspace(3)* %ptr) { - %asc0 = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* - store volatile i32 addrspace(4)* %asc0, i32 addrspace(4)* addrspace(1)* undef - %gep0 = getelementptr i32, i32 addrspace(4)* %asc0, i64 9 - %asc1 = addrspacecast i32 addrspace(4)* %gep0 to i32 addrspace(3)* + %asc0 = addrspacecast i32 addrspace(3)* %ptr to i32* + store volatile i32* %asc0, i32* addrspace(1)* undef + %gep0 = getelementptr i32, i32* %asc0, i64 9 + %asc1 = addrspacecast i32* %gep0 to i32 addrspace(3)* store i32 8, i32 addrspace(3)* %asc1, align 8 ret void } Index: test/Transforms/InferAddressSpaces/AMDGPU/infer-getelementptr.ll =================================================================== --- test/Transforms/InferAddressSpaces/AMDGPU/infer-getelementptr.ll +++ test/Transforms/InferAddressSpaces/AMDGPU/infer-getelementptr.ll @@ -9,8 +9,8 @@ ; CHECK: %gep0 = getelementptr inbounds double, double addrspace(3)* getelementptr inbounds ([648 x double], [648 x double] addrspace(3)* @lds, i64 0, i64 384), i64 %idx0 ; CHECK-NEXT: store double 1.000000e+00, double addrspace(3)* %gep0, align 8 define void @simplified_constexpr_gep_addrspacecast(i64 %idx0, i64 %idx1) { - %gep0 = getelementptr inbounds double, double addrspace(4)* addrspacecast (double addrspace(3)* getelementptr inbounds ([648 x double], [648 x double] addrspace(3)* @lds, i64 0, i64 384) to double addrspace(4)*), i64 %idx0 - %asc = addrspacecast double addrspace(4)* %gep0 to double addrspace(3)* + %gep0 = getelementptr inbounds double, double* addrspacecast (double addrspace(3)* getelementptr inbounds ([648 x double], [648 x double] addrspace(3)* @lds, i64 0, i64 384) to 
double*), i64 %idx0 + %asc = addrspacecast double* %gep0 to double addrspace(3)* store double 1.000000e+00, double addrspace(3)* %asc, align 8 ret void } @@ -19,8 +19,8 @@ ; CHECK-NEXT: %gep0 = getelementptr inbounds double, double addrspace(3)* getelementptr inbounds ([648 x double], [648 x double] addrspace(3)* @lds, i64 0, i64 384), i64 %idx0 ; CHECK-NEXT: store double 1.000000e+00, double addrspace(3)* %gep0, align 8 define void @constexpr_gep_addrspacecast(i64 %idx0, i64 %idx1) { - %gep0 = getelementptr inbounds double, double addrspace(4)* getelementptr ([648 x double], [648 x double] addrspace(4)* addrspacecast ([648 x double] addrspace(3)* @lds to [648 x double] addrspace(4)*), i64 0, i64 384), i64 %idx0 - %asc = addrspacecast double addrspace(4)* %gep0 to double addrspace(3)* + %gep0 = getelementptr inbounds double, double* getelementptr ([648 x double], [648 x double]* addrspacecast ([648 x double] addrspace(3)* @lds to [648 x double]*), i64 0, i64 384), i64 %idx0 + %asc = addrspacecast double* %gep0 to double addrspace(3)* store double 1.0, double addrspace(3)* %asc, align 8 ret void } @@ -30,27 +30,27 @@ ; CHECK-NEXT: %gep1 = getelementptr inbounds double, double addrspace(3)* %gep0, i64 %idx1 ; CHECK-NEXT: store double 1.000000e+00, double addrspace(3)* %gep1, align 8 define void @constexpr_gep_gep_addrspacecast(i64 %idx0, i64 %idx1) { - %gep0 = getelementptr inbounds double, double addrspace(4)* getelementptr ([648 x double], [648 x double] addrspace(4)* addrspacecast ([648 x double] addrspace(3)* @lds to [648 x double] addrspace(4)*), i64 0, i64 384), i64 %idx0 - %gep1 = getelementptr inbounds double, double addrspace(4)* %gep0, i64 %idx1 - %asc = addrspacecast double addrspace(4)* %gep1 to double addrspace(3)* + %gep0 = getelementptr inbounds double, double* getelementptr ([648 x double], [648 x double]* addrspacecast ([648 x double] addrspace(3)* @lds to [648 x double]*), i64 0, i64 384), i64 %idx0 + %gep1 = getelementptr inbounds double, double* 
%gep0, i64 %idx1 + %asc = addrspacecast double* %gep1 to double addrspace(3)* store double 1.0, double addrspace(3)* %asc, align 8 ret void } ; Don't crash ; CHECK-LABEL: @vector_gep( -; CHECK: %cast = addrspacecast <4 x [1024 x i32] addrspace(3)*> %array to <4 x [1024 x i32] addrspace(4)*> +; CHECK: %cast = addrspacecast <4 x [1024 x i32] addrspace(3)*> %array to <4 x [1024 x i32]*> define amdgpu_kernel void @vector_gep(<4 x [1024 x i32] addrspace(3)*> %array) nounwind { - %cast = addrspacecast <4 x [1024 x i32] addrspace(3)*> %array to <4 x [1024 x i32] addrspace(4)*> - %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(4)*> %cast, <4 x i16> zeroinitializer, <4 x i16> - %p0 = extractelement <4 x i32 addrspace(4)*> %p, i32 0 - %p1 = extractelement <4 x i32 addrspace(4)*> %p, i32 1 - %p2 = extractelement <4 x i32 addrspace(4)*> %p, i32 2 - %p3 = extractelement <4 x i32 addrspace(4)*> %p, i32 3 - store i32 99, i32 addrspace(4)* %p0 - store i32 99, i32 addrspace(4)* %p1 - store i32 99, i32 addrspace(4)* %p2 - store i32 99, i32 addrspace(4)* %p3 + %cast = addrspacecast <4 x [1024 x i32] addrspace(3)*> %array to <4 x [1024 x i32]*> + %p = getelementptr [1024 x i32], <4 x [1024 x i32]*> %cast, <4 x i16> zeroinitializer, <4 x i16> + %p0 = extractelement <4 x i32*> %p, i32 0 + %p1 = extractelement <4 x i32*> %p, i32 1 + %p2 = extractelement <4 x i32*> %p, i32 2 + %p3 = extractelement <4 x i32*> %p, i32 3 + store i32 99, i32* %p0 + store i32 99, i32* %p1 + store i32 99, i32* %p2 + store i32 99, i32* %p3 ret void } @@ -61,12 +61,12 @@ ; CHECK-NEXT: store double 1.000000e+00, double addrspace(3)* %gep1, align 8 ; CHECK-NEXT: ret void define void @repeated_constexpr_gep_addrspacecast(i64 %idx0, i64 %idx1) { - %gep0 = getelementptr inbounds double, double addrspace(4)* getelementptr ([648 x double], [648 x double] addrspace(4)* addrspacecast ([648 x double] addrspace(3)* @lds to [648 x double] addrspace(4)*), i64 0, i64 384), i64 %idx0 - %asc0 = addrspacecast double 
addrspace(4)* %gep0 to double addrspace(3)* + %gep0 = getelementptr inbounds double, double* getelementptr ([648 x double], [648 x double]* addrspacecast ([648 x double] addrspace(3)* @lds to [648 x double]*), i64 0, i64 384), i64 %idx0 + %asc0 = addrspacecast double* %gep0 to double addrspace(3)* store double 1.0, double addrspace(3)* %asc0, align 8 - %gep1 = getelementptr inbounds double, double addrspace(4)* getelementptr ([648 x double], [648 x double] addrspace(4)* addrspacecast ([648 x double] addrspace(3)* @lds to [648 x double] addrspace(4)*), i64 0, i64 384), i64 %idx1 - %asc1 = addrspacecast double addrspace(4)* %gep1 to double addrspace(3)* + %gep1 = getelementptr inbounds double, double* getelementptr ([648 x double], [648 x double]* addrspacecast ([648 x double] addrspace(3)* @lds to [648 x double]*), i64 0, i64 384), i64 %idx1 + %asc1 = addrspacecast double* %gep1 to double addrspace(3)* store double 1.0, double addrspace(3)* %asc1, align 8 ret void Index: test/Transforms/InferAddressSpaces/AMDGPU/intrinsics.ll =================================================================== --- test/Transforms/InferAddressSpaces/AMDGPU/intrinsics.ll +++ test/Transforms/InferAddressSpaces/AMDGPU/intrinsics.ll @@ -3,143 +3,143 @@ ; CHECK-LABEL: @objectsize_group_to_flat_i32( ; CHECK: %val = call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %group.ptr, i1 true, i1 false) define i32 @objectsize_group_to_flat_i32(i8 addrspace(3)* %group.ptr) #0 { - %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* - %val = call i32 @llvm.objectsize.i32.p4i8(i8 addrspace(4)* %cast, i1 true, i1 false) + %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8* + %val = call i32 @llvm.objectsize.i32.p0i8(i8* %cast, i1 true, i1 false) ret i32 %val } ; CHECK-LABEL: @objectsize_global_to_flat_i64( ; CHECK: %val = call i64 @llvm.objectsize.i64.p3i8(i8 addrspace(3)* %global.ptr, i1 true, i1 false) define i64 @objectsize_global_to_flat_i64(i8 addrspace(3)* %global.ptr) #0 { 
- %cast = addrspacecast i8 addrspace(3)* %global.ptr to i8 addrspace(4)* - %val = call i64 @llvm.objectsize.i64.p4i8(i8 addrspace(4)* %cast, i1 true, i1 false) + %cast = addrspacecast i8 addrspace(3)* %global.ptr to i8* + %val = call i64 @llvm.objectsize.i64.p0i8(i8* %cast, i1 true, i1 false) ret i64 %val } ; CHECK-LABEL: @atomicinc_global_to_flat_i32( ; CHECK: call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %global.ptr, i32 %y, i32 0, i32 0, i1 false) define i32 @atomicinc_global_to_flat_i32(i32 addrspace(1)* %global.ptr, i32 %y) #0 { - %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)* - %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %cast, i32 %y, i32 0, i32 0, i1 false) + %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32* + %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %cast, i32 %y, i32 0, i32 0, i1 false) ret i32 %ret } ; CHECK-LABEL: @atomicinc_group_to_flat_i32( ; CHECK: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %group.ptr, i32 %y, i32 0, i32 0, i1 false) define i32 @atomicinc_group_to_flat_i32(i32 addrspace(3)* %group.ptr, i32 %y) #0 { - %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)* - %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %cast, i32 %y, i32 0, i32 0, i1 false) + %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32* + %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %cast, i32 %y, i32 0, i32 0, i1 false) ret i32 %ret } ; CHECK-LABEL: @atomicinc_global_to_flat_i64( ; CHECK: call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %global.ptr, i64 %y, i32 0, i32 0, i1 false) define i64 @atomicinc_global_to_flat_i64(i64 addrspace(1)* %global.ptr, i64 %y) #0 { - %cast = addrspacecast i64 addrspace(1)* %global.ptr to i64 addrspace(4)* - %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 false) + %cast = addrspacecast i64 addrspace(1)* %global.ptr 
to i64* + %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %cast, i64 %y, i32 0, i32 0, i1 false) ret i64 %ret } ; CHECK-LABEL: @atomicinc_group_to_flat_i64( ; CHECK: call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %group.ptr, i64 %y, i32 0, i32 0, i1 false) define i64 @atomicinc_group_to_flat_i64(i64 addrspace(3)* %group.ptr, i64 %y) #0 { - %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)* - %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 false) + %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64* + %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %cast, i64 %y, i32 0, i32 0, i1 false) ret i64 %ret } ; CHECK-LABEL: @atomicdec_global_to_flat_i32( ; CHECK: call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %global.ptr, i32 %val, i32 0, i32 0, i1 false) define i32 @atomicdec_global_to_flat_i32(i32 addrspace(1)* %global.ptr, i32 %val) #0 { - %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)* - %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %cast, i32 %val, i32 0, i32 0, i1 false) + %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32* + %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %cast, i32 %val, i32 0, i32 0, i1 false) ret i32 %ret } ; CHECK-LABEL: @atomicdec_group_to_flat_i32( ; CHECK: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %group.ptr, i32 %val, i32 0, i32 0, i1 false) define i32 @atomicdec_group_to_flat_i32(i32 addrspace(3)* %group.ptr, i32 %val) #0 { - %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)* - %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %cast, i32 %val, i32 0, i32 0, i1 false) + %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32* + %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %cast, i32 %val, i32 0, i32 0, i1 false) ret i32 %ret } ; CHECK-LABEL: @atomicdec_global_to_flat_i64( ; CHECK: call 
i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %global.ptr, i64 %y, i32 0, i32 0, i1 false) define i64 @atomicdec_global_to_flat_i64(i64 addrspace(1)* %global.ptr, i64 %y) #0 { - %cast = addrspacecast i64 addrspace(1)* %global.ptr to i64 addrspace(4)* - %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 false) + %cast = addrspacecast i64 addrspace(1)* %global.ptr to i64* + %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %cast, i64 %y, i32 0, i32 0, i1 false) ret i64 %ret } ; CHECK-LABEL: @atomicdec_group_to_flat_i64( ; CHECK: call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %group.ptr, i64 %y, i32 0, i32 0, i1 false define i64 @atomicdec_group_to_flat_i64(i64 addrspace(3)* %group.ptr, i64 %y) #0 { - %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)* - %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 false) + %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64* + %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %cast, i64 %y, i32 0, i32 0, i1 false) ret i64 %ret } ; CHECK-LABEL: @volatile_atomicinc_group_to_flat_i64( -; CHECK-NEXT: %1 = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)* -; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %1, i64 %y, i32 0, i32 0, i1 true) +; CHECK-NEXT: %1 = addrspacecast i64 addrspace(3)* %group.ptr to i64* +; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %1, i64 %y, i32 0, i32 0, i1 true) define i64 @volatile_atomicinc_group_to_flat_i64(i64 addrspace(3)* %group.ptr, i64 %y) #0 { - %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)* - %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 true) + %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64* + %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %cast, i64 %y, i32 0, i32 0, i1 
true) ret i64 %ret } ; CHECK-LABEL: @volatile_atomicdec_global_to_flat_i32( -; CHECK-NEXT: %1 = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)* -; CHECK-NEXT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %1, i32 %val, i32 0, i32 0, i1 true) +; CHECK-NEXT: %1 = addrspacecast i32 addrspace(1)* %global.ptr to i32* +; CHECK-NEXT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %1, i32 %val, i32 0, i32 0, i1 true) define i32 @volatile_atomicdec_global_to_flat_i32(i32 addrspace(1)* %global.ptr, i32 %val) #0 { - %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)* - %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %cast, i32 %val, i32 0, i32 0, i1 true) + %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32* + %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %cast, i32 %val, i32 0, i32 0, i1 true) ret i32 %ret } ; CHECK-LABEL: @volatile_atomicdec_group_to_flat_i32( -; CHECK-NEXT: %1 = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)* -; CHECK-NEXT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %1, i32 %val, i32 0, i32 0, i1 true) +; CHECK-NEXT: %1 = addrspacecast i32 addrspace(3)* %group.ptr to i32* +; CHECK-NEXT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %1, i32 %val, i32 0, i32 0, i1 true) define i32 @volatile_atomicdec_group_to_flat_i32(i32 addrspace(3)* %group.ptr, i32 %val) #0 { - %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)* - %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %cast, i32 %val, i32 0, i32 0, i1 true) + %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32* + %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %cast, i32 %val, i32 0, i32 0, i1 true) ret i32 %ret } ; CHECK-LABEL: @volatile_atomicdec_global_to_flat_i64( -; CHECK-NEXT: %1 = addrspacecast i64 addrspace(1)* %global.ptr to i64 addrspace(4)* -; CHECK-NEXT: %ret = call i64 
@llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %1, i64 %y, i32 0, i32 0, i1 true) +; CHECK-NEXT: %1 = addrspacecast i64 addrspace(1)* %global.ptr to i64* +; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %1, i64 %y, i32 0, i32 0, i1 true) define i64 @volatile_atomicdec_global_to_flat_i64(i64 addrspace(1)* %global.ptr, i64 %y) #0 { - %cast = addrspacecast i64 addrspace(1)* %global.ptr to i64 addrspace(4)* - %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 true) + %cast = addrspacecast i64 addrspace(1)* %global.ptr to i64* + %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %cast, i64 %y, i32 0, i32 0, i1 true) ret i64 %ret } ; CHECK-LABEL: @volatile_atomicdec_group_to_flat_i64( -; CHECK-NEXT: %1 = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)* -; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %1, i64 %y, i32 0, i32 0, i1 true) +; CHECK-NEXT: %1 = addrspacecast i64 addrspace(3)* %group.ptr to i64* +; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %1, i64 %y, i32 0, i32 0, i1 true) define i64 @volatile_atomicdec_group_to_flat_i64(i64 addrspace(3)* %group.ptr, i64 %y) #0 { - %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)* - %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 true) + %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64* + %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %cast, i64 %y, i32 0, i32 0, i1 true) ret i64 %ret } ; CHECK-LABEL: @invalid_variable_volatile_atomicinc_group_to_flat_i64( -; CHECK-NEXT: %1 = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)* -; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %1, i64 %y, i32 0, i32 0, i1 %volatile.var) +; CHECK-NEXT: %1 = addrspacecast i64 addrspace(3)* %group.ptr to i64* +; CHECK-NEXT: %ret = call i64 
@llvm.amdgcn.atomic.inc.i64.p0i64(i64* %1, i64 %y, i32 0, i32 0, i1 %volatile.var) define i64 @invalid_variable_volatile_atomicinc_group_to_flat_i64(i64 addrspace(3)* %group.ptr, i64 %y, i1 %volatile.var) #0 { - %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)* - %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 %volatile.var) + %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64* + %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %cast, i64 %y, i32 0, i32 0, i1 %volatile.var) ret i64 %ret } -declare i32 @llvm.objectsize.i32.p4i8(i8 addrspace(4)*, i1, i1) #1 -declare i64 @llvm.objectsize.i64.p4i8(i8 addrspace(4)*, i1, i1) #1 -declare i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* nocapture, i32, i32, i32, i1) #2 -declare i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* nocapture, i64, i32, i32, i1) #2 -declare i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* nocapture, i32, i32, i32, i1) #2 -declare i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* nocapture, i64, i32, i32, i1) #2 +declare i32 @llvm.objectsize.i32.p0i8(i8*, i1, i1) #1 +declare i64 @llvm.objectsize.i64.p0i8(i8*, i1, i1) #1 +declare i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* nocapture, i32, i32, i32, i1) #2 +declare i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* nocapture, i64, i32, i32, i1) #2 +declare i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* nocapture, i32, i32, i32, i1) #2 +declare i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* nocapture, i64, i32, i32, i1) #2 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll =================================================================== --- test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll +++ test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll @@ -3,100 +3,100 @@ ; CHECK-LABEL: @memset_group_to_flat( ; CHECK: call void @llvm.memset.p3i8.i64(i8 
addrspace(3)* %group.ptr, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 define amdgpu_kernel void @memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 { - %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* - call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 + %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8* + call void @llvm.memset.p0i8.i64(i8* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void } ; CHECK-LABEL: @memset_global_to_flat( ; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %global.ptr, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 define amdgpu_kernel void @memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 { - %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)* - call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 + %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8* + call void @llvm.memset.p0i8.i64(i8* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void } ; CHECK-LABEL: @memset_group_to_flat_no_md( ; CHECK: call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 4, i64 %size, i32 4, i1 false){{$}} define amdgpu_kernel void @memset_group_to_flat_no_md(i8 addrspace(3)* %group.ptr, i64 %size) #0 { - %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* - call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 %size, i32 4, i1 false) + %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8* + call void @llvm.memset.p0i8.i64(i8* %cast, i8 4, i64 %size, i32 4, i1 false) ret void } ; CHECK-LABEL: @memset_global_to_flat_no_md( ; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %global.ptr, i8 4, i64 %size, i32 4, i1 false){{$}} define amdgpu_kernel void 
@memset_global_to_flat_no_md(i8 addrspace(1)* %global.ptr, i64 %size) #0 { - %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)* - call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 %size, i32 4, i1 false) + %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8* + call void @llvm.memset.p0i8.i64(i8* %cast, i8 4, i64 %size, i32 4, i1 false) ret void } ; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group( -; CHCK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { - %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* - call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 +; CHECK: call void @llvm.memcpy.p0i8.p3i8.i64(i8* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 +define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(i8* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { + %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8* + call void @llvm.memcpy.p4i8.p0i8.i64(i8* %dest, i8* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void } ; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_with_group( -; CHECK: call void @llvm.memcpy.p3i8.p4i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size) #0 { - %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)* - call void
@llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 +; CHECK: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* %dest.group.ptr, i8* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 +define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.group.ptr, i8* %src.ptr, i64 %size) #0 { + %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8* + call void @llvm.memcpy.p4i8.p0i8.i64(i8* %cast.dest, i8* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void } ; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_src_with_group( ; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* %src.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_src_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { - %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* - %cast.dest = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* - call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 + %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8* + %cast.dest = addrspacecast i8 addrspace(3)* %src.group.ptr to i8* + call void @llvm.memcpy.p4i8.p0i8.i64(i8* %cast.dest, i8* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void } ; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_group_src_global( ; CHECK: call void @llvm.memcpy.p3i8.p1i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 define amdgpu_kernel void 
@memcpy_flat_to_flat_replace_dest_group_src_global(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size) #0 { - %cast.src = addrspacecast i8 addrspace(1)* %src.global.ptr to i8 addrspace(4)* - %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)* - call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 + %cast.src = addrspacecast i8 addrspace(1)* %src.global.ptr to i8* + %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8* + call void @llvm.memcpy.p4i8.p0i8.i64(i8* %cast.dest, i8* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void } ; CHECK-LABEL: @memcpy_group_to_flat_replace_dest_global( ; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 define amdgpu_kernel void @memcpy_group_to_flat_replace_dest_global(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size) #0 { - %cast.dest = addrspacecast i8 addrspace(1)* %dest.global.ptr to i8 addrspace(4)* - call void @llvm.memcpy.p4i8.p3i8.i32(i8 addrspace(4)* %cast.dest, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 + %cast.dest = addrspacecast i8 addrspace(1)* %dest.global.ptr to i8* + call void @llvm.memcpy.p0i8.p3i8.i32(i8* %cast.dest, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void } ; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct( -; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa.struct !7 -define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 
%size) #0 { - %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* - call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa.struct !7 +; CHECK: call void @llvm.memcpy.p0i8.p3i8.i64(i8* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa.struct !7 +define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(i8* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { + %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8* + call void @llvm.memcpy.p4i8.p0i8.i64(i8* %dest, i8* %cast.src, i64 %size, i32 4, i1 false), !tbaa.struct !7 ret void } ; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_no_md( -; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}} -define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { - %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* - call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false) +; CHECK: call void @llvm.memcpy.p0i8.p3i8.i64(i8* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}} +define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md(i8* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { + %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8* + call void @llvm.memcpy.p4i8.p0i8.i64(i8* %dest, i8* %cast.src, i64 %size, i32 4, i1 false) ret void } ; CHECK-LABEL: @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md( -; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest0, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}} -; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, 
i64 %size, i32 4, i1 false){{$}} -define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest0, i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { - %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* - call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest0, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false) - call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest1, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false) +; CHECK: call void @llvm.memcpy.p0i8.p3i8.i64(i8* %dest0, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}} +; CHECK: call void @llvm.memcpy.p0i8.p3i8.i64(i8* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}} +define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(i8* %dest0, i8* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { + %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8* + call void @llvm.memcpy.p4i8.p0i8.i64(i8* %dest0, i8* %cast.src, i64 %size, i32 4, i1 false) + call void @llvm.memcpy.p4i8.p0i8.i64(i8* %dest1, i8* %cast.src, i64 %size, i32 4, i1 false) ret void } @@ -104,22 +104,22 @@ ; CHECK-LABEL: @memcpy_group_flat_to_flat_self( ; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 addrspace(3)* %group.ptr, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 define amdgpu_kernel void @memcpy_group_flat_to_flat_self(i8 addrspace(3)* %group.ptr) #0 { - %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* - call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast, i8 addrspace(4)* %cast, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 + %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8* + call void @llvm.memcpy.p4i8.p0i8.i64(i8* %cast, i8* %cast, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void } ; CHECK-LABEL: 
@memmove_flat_to_flat_replace_src_with_group( -; CHECK: call void @llvm.memmove.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { - %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* - call void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 +; CHECK: call void @llvm.memmove.p0i8.p3i8.i64(i8* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 +define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(i8* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { + %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8* + call void @llvm.memmove.p4i8.p0i8.i64(i8* %dest, i8* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void } -declare void @llvm.memset.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8, i64, i32, i1) #1 -declare void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8 addrspace(4)* nocapture readonly, i64, i32, i1) #1 -declare void @llvm.memcpy.p4i8.p3i8.i32(i8 addrspace(4)* nocapture writeonly, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1 -declare void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8 addrspace(4)* nocapture readonly, i64, i32, i1) #1 +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #1 +declare void @llvm.memcpy.p4i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1 +declare void @llvm.memcpy.p0i8.p3i8.i32(i8* nocapture writeonly, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1 +declare void @llvm.memmove.p4i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture 
readonly, i64, i32, i1) #1 attributes #0 = { nounwind } attributes #1 = { argmemonly nounwind } Index: test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll =================================================================== --- test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll +++ test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces %s | FileCheck %s +target datalayout = "A5" ; Regression tests from old HSAIL addrspacecast optimization pass @@ -14,7 +15,7 @@ %tmp1 = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = zext i32 %tmp1 to i64 %tmp3 = add i64 %tmp2, %arg0 - %vecload1 = load <2 x double>, <2 x double> addrspace(4)* bitcast (double addrspace(4)* getelementptr ([100 x double], [100 x double] addrspace(4)* addrspacecast ([100 x double] addrspace(1)* @data to [100 x double] addrspace(4)*), i64 0, i64 4) to <2 x double> addrspace(4)*), align 8 + %vecload1 = load <2 x double>, <2 x double>* bitcast (double* getelementptr ([100 x double], [100 x double]* addrspacecast ([100 x double] addrspace(1)* @data to [100 x double]*), i64 0, i64 4) to <2 x double>*), align 8 %cmp = fcmp ord <2 x double> %vecload1, zeroinitializer %sext = sext <2 x i1> %cmp to <2 x i64> %tmp4 = extractelement <2 x i64> %sext, i64 0 @@ -30,7 +31,7 @@ @generic_address_bug9749.val = internal addrspace(1) global float 0.0, align 4 -declare i32 @_Z9get_fencePU3AS4v(i8 addrspace(4)*) +declare i32 @_Z9get_fencePv(i8*) %opencl.pipe_t = type opaque ; This is a compile time assert bug, but we still want to check optimization @@ -53,24 +54,24 @@ ; Should generate flat load ; CHECK-LABEL: @generic_address_bug9749( ; CHECK: br i1 -; CHECK: load float, float addrspace(4)* +; CHECK: load float, float* ; CHECK: br label define amdgpu_kernel void @generic_address_bug9749(i32 addrspace(1)* nocapture %results) #0 { entry: - %ptr = alloca float addrspace(4)*, align 8 + %ptr = alloca float*, align 
8, addrspace(5) %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = zext i32 %tmp to i64 store float 0x3FB99999A0000000, float addrspace(1)* @generic_address_bug9749.val, align 4 - store volatile float addrspace(4)* addrspacecast (float addrspace(1)* @generic_address_bug9749.val to float addrspace(4)*), float addrspace(4)** %ptr, align 8 - %tmp2 = load volatile float addrspace(4)*, float addrspace(4)** %ptr, align 8 + store volatile float* addrspacecast (float addrspace(1)* @generic_address_bug9749.val to float*), float* addrspace(5)* %ptr, align 8 + %tmp2 = load volatile float*, float* addrspace(5)* %ptr, align 8 %tmp3 = load float, float addrspace(1)* @generic_address_bug9749.val, align 4 - %tmp4 = bitcast float addrspace(4)* %tmp2 to i8 addrspace(4)* - %call.i = call i32 @_Z9get_fencePU3AS4v(i8 addrspace(4)* %tmp4) #1 + %tmp4 = bitcast float* %tmp2 to i8* + %call.i = call i32 @_Z9get_fencePv(i8* %tmp4) #1 %switch.i.i = icmp ult i32 %call.i, 4 br i1 %switch.i.i, label %if.end.i, label %helperFunction.exit if.end.i: ; preds = %entry - %tmp5 = load float, float addrspace(4)* %tmp2, align 4 + %tmp5 = load float, float* %tmp2, align 4 %not.cmp.i = fcmp oeq float %tmp5, %tmp3 %phitmp = zext i1 %not.cmp.i to i32 br label %helperFunction.exit @@ -91,14 +92,14 @@ br i1 %cmp1, label %for.end, label %for.body.lr.ph for.body.lr.ph: ; preds = %entry - %tmp = addrspacecast i32 addrspace(3)* %in to i32 addrspace(4)* + %tmp = addrspacecast i32 addrspace(3)* %in to i32* br label %for.body for.body: ; preds = %for.body, %for.body.lr.ph %i.03 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] - %ptr.02 = phi i32 addrspace(4)* [ %tmp, %for.body.lr.ph ], [ %add.ptr, %for.body ] - store i32 %i.03, i32 addrspace(4)* %ptr.02, align 4 - %add.ptr = getelementptr inbounds i32, i32 addrspace(4)* %ptr.02, i64 4 + %ptr.02 = phi i32* [ %tmp, %for.body.lr.ph ], [ %add.ptr, %for.body ] + store i32 %i.03, i32* %ptr.02, align 4 + %add.ptr = getelementptr inbounds i32, i32* %ptr.02, i64 4 %inc 
= add nuw i32 %i.03, 1 %exitcond = icmp eq i32 %inc, %numElems br i1 %exitcond, label %for.end, label %for.body @@ -116,23 +117,23 @@ %tmp2 = zext i32 %tmp1 to i64 %tmp3 = add i64 %tmp2, %arg0 %sext = shl i64 %tmp3, 32 - %tmp4 = addrspacecast i32 addrspace(3)* %destValues to i32 addrspace(4)* - %tmp5 = addrspacecast i32 addrspace(3)* %sourceA to i32 addrspace(4)* + %tmp4 = addrspacecast i32 addrspace(3)* %destValues to i32* + %tmp5 = addrspacecast i32 addrspace(3)* %sourceA to i32* %tmp6 = ashr exact i64 %sext, 31 - %tmp7 = getelementptr inbounds i32, i32 addrspace(4)* %tmp5, i64 %tmp6 - %arrayidx_v4 = bitcast i32 addrspace(4)* %tmp7 to <2 x i32> addrspace(4)* - %vecload = load <2 x i32>, <2 x i32> addrspace(4)* %arrayidx_v4, align 4 + %tmp7 = getelementptr inbounds i32, i32* %tmp5, i64 %tmp6 + %arrayidx_v4 = bitcast i32* %tmp7 to <2 x i32>* + %vecload = load <2 x i32>, <2 x i32>* %arrayidx_v4, align 4 %tmp8 = extractelement <2 x i32> %vecload, i32 0 %tmp9 = extractelement <2 x i32> %vecload, i32 1 %tmp10 = icmp eq i32 %tmp8, 0 %tmp11 = select i1 %tmp10, i32 32, i32 %tmp8 %tmp12 = icmp eq i32 %tmp9, 0 %tmp13 = select i1 %tmp12, i32 32, i32 %tmp9 - %tmp14 = getelementptr inbounds i32, i32 addrspace(4)* %tmp4, i64 %tmp6 + %tmp14 = getelementptr inbounds i32, i32* %tmp4, i64 %tmp6 %tmp15 = insertelement <2 x i32> undef, i32 %tmp11, i32 0 %tmp16 = insertelement <2 x i32> %tmp15, i32 %tmp13, i32 1 - %arrayidx_v41 = bitcast i32 addrspace(4)* %tmp14 to <2 x i32> addrspace(4)* - store <2 x i32> %tmp16, <2 x i32> addrspace(4)* %arrayidx_v41, align 4 + %arrayidx_v41 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp16, <2 x i32>* %arrayidx_v41, align 4 ret void } Index: test/Transforms/InferAddressSpaces/AMDGPU/select.ll =================================================================== --- test/Transforms/InferAddressSpaces/AMDGPU/select.ll +++ test/Transforms/InferAddressSpaces/AMDGPU/select.ll @@ -4,25 +4,25 @@ ; this doesn't do something insane on non-canonical 
IR. ; CHECK-LABEL: @return_select_group_flat( -; CHECK-NEXT: %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* -; CHECK-NEXT: %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)* -; CHECK-NEXT: %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1 -; CHECK-NEXT: ret i32 addrspace(4)* %select -define i32 addrspace(4)* @return_select_group_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)* - %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1 - ret i32 addrspace(4)* %select +; CHECK-NEXT: %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* +; CHECK-NEXT: %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32* +; CHECK-NEXT: %select = select i1 %c, i32* %cast0, i32* %cast1 +; CHECK-NEXT: ret i32* %select +define i32* @return_select_group_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 { + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32* + %select = select i1 %c, i32* %cast0, i32* %cast1 + ret i32* %select } ; CHECK-LABEL: @store_select_group_flat( ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1 ; CHECK: store i32 -1, i32 addrspace(3)* %select define amdgpu_kernel void @store_select_group_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)* - %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1 - store i32 -1, i32 addrspace(4)* %select + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %cast1 = addrspacecast i32 
addrspace(3)* %group.ptr.1 to i32* + %select = select i1 %c, i32* %cast0, i32* %cast1 + store i32 -1, i32* %select ret void } @@ -31,23 +31,23 @@ ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1, !prof !0 ; CHECK: %load = load i32, i32 addrspace(3)* %select define i32 @load_select_group_flat_md(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)* - %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1, !prof !0 - %load = load i32, i32 addrspace(4)* %select + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32* + %select = select i1 %c, i32* %cast0, i32* %cast1, !prof !0 + %load = load i32, i32* %select ret i32 %load } ; CHECK-LABEL: @store_select_mismatch_group_private_flat( -; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* -; CHECK: %2 = addrspacecast i32* %private.ptr.1 to i32 addrspace(4)* -; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* %2 -; CHECK: store i32 -1, i32 addrspace(4)* %select -define amdgpu_kernel void @store_select_mismatch_group_private_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32* %private.ptr.1) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %cast1 = addrspacecast i32* %private.ptr.1 to i32 addrspace(4)* - %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1 - store i32 -1, i32 addrspace(4)* %select +; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* +; CHECK: %2 = addrspacecast i32 addrspace(5)* %private.ptr.1 to i32* +; CHECK: %select = select i1 %c, i32* %1, i32* %2 +; CHECK: store i32 -1, i32* %select +define amdgpu_kernel void @store_select_mismatch_group_private_flat(i1 %c, i32 addrspace(3)* 
%group.ptr.0, i32 addrspace(5)* %private.ptr.1) #0 { + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %cast1 = addrspacecast i32 addrspace(5)* %private.ptr.1 to i32* + %select = select i1 %c, i32* %cast0, i32* %cast1 + store i32 -1, i32* %select ret void } @@ -58,35 +58,35 @@ ; CHECK: %tmp = load i32, i32 addrspace(3)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(3)* @lds0, i32 addrspace(3)* @lds1) define i32 @constexpr_select_group_flat() #0 { bb: - %tmp = load i32, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds1 to i32 addrspace(4)*)) + %tmp = load i32, i32* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32* addrspacecast (i32 addrspace(3)* @lds0 to i32*), i32* addrspacecast (i32 addrspace(3)* @lds1 to i32*)) ret i32 %tmp } ; CHECK-LABEL: @constexpr_select_group_global_flat_mismatch( -; CHECK: %tmp = load i32, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)) +; CHECK: %tmp = load i32, i32* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32* addrspacecast (i32 addrspace(3)* @lds0 to i32*), i32* addrspacecast (i32 addrspace(1)* @global0 to i32*)) define i32 @constexpr_select_group_global_flat_mismatch() #0 { bb: - %tmp = load i32, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)) + %tmp = load i32, i32* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to 
i32), i32 4), i32* addrspacecast (i32 addrspace(3)* @lds0 to i32*), i32* addrspacecast (i32 addrspace(1)* @global0 to i32*)) ret i32 %tmp } ; CHECK-LABEL: @store_select_group_flat_null( -; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*) +; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32* null to i32 addrspace(3)*) ; CHECK: store i32 -1, i32 addrspace(3)* %select define amdgpu_kernel void @store_select_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* null - store i32 -1, i32 addrspace(4)* %select + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %select = select i1 %c, i32* %cast0, i32* null + store i32 -1, i32* %select ret void } ; CHECK-LABEL: @store_select_group_flat_null_swap( -; CHECK: %select = select i1 %c, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*), i32 addrspace(3)* %group.ptr.0 +; CHECK: %select = select i1 %c, i32 addrspace(3)* addrspacecast (i32* null to i32 addrspace(3)*), i32 addrspace(3)* %group.ptr.0 ; CHECK: store i32 -1, i32 addrspace(3)* %select define amdgpu_kernel void @store_select_group_flat_null_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %select = select i1 %c, i32 addrspace(4)* null, i32 addrspace(4)* %cast0 - store i32 -1, i32 addrspace(4)* %select + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %select = select i1 %c, i32* null, i32* %cast0 + store i32 -1, i32* %select ret void } @@ -94,9 +94,9 @@ ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* undef ; CHECK: store i32 -1, i32 addrspace(3)* %select define amdgpu_kernel void @store_select_group_flat_undef(i1 
%c, i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* undef - store i32 -1, i32 addrspace(4)* %select + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %select = select i1 %c, i32* %cast0, i32* undef + store i32 -1, i32* %select ret void } @@ -104,21 +104,21 @@ ; CHECK: %select = select i1 %c, i32 addrspace(3)* undef, i32 addrspace(3)* %group.ptr.0 ; CHECK: store i32 -1, i32 addrspace(3)* %select define amdgpu_kernel void @store_select_group_flat_undef_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %select = select i1 %c, i32 addrspace(4)* undef, i32 addrspace(4)* %cast0 - store i32 -1, i32 addrspace(4)* %select + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %select = select i1 %c, i32* undef, i32* %cast0 + store i32 -1, i32* %select ret void } ; CHECK-LABEL: @store_select_gep_group_flat_null( -; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*) +; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32* null to i32 addrspace(3)*) ; CHECK: %gep = getelementptr i32, i32 addrspace(3)* %select, i64 16 ; CHECK: store i32 -1, i32 addrspace(3)* %gep define amdgpu_kernel void @store_select_gep_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* null - %gep = getelementptr i32, i32 addrspace(4)* %select, i64 16 - store i32 -1, i32 addrspace(4)* %gep + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %select = select i1 %c, i32* %cast0, i32* null + %gep = getelementptr i32, i32* %select, i64 16 + store i32 -1, i32* %gep ret void 
} @@ -128,19 +128,19 @@ ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* @lds1 ; CHECK: store i32 7, i32 addrspace(3)* %select define amdgpu_kernel void @store_select_group_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds1 to i32 addrspace(4)*) - store i32 7, i32 addrspace(4)* %select + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %select = select i1 %c, i32* %cast0, i32* addrspacecast (i32 addrspace(3)* @lds1 to i32*) + store i32 7, i32* %select ret void } ; CHECK-LABEL: @store_select_group_flat_inttoptr_flat( -; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* inttoptr (i64 12345 to i32 addrspace(4)*) to i32 addrspace(3)*) +; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32* inttoptr (i64 12345 to i32*) to i32 addrspace(3)*) ; CHECK: store i32 7, i32 addrspace(3)* %select define amdgpu_kernel void @store_select_group_flat_inttoptr_flat(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* inttoptr (i64 12345 to i32 addrspace(4)*) - store i32 7, i32 addrspace(4)* %select + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %select = select i1 %c, i32* %cast0, i32* inttoptr (i64 12345 to i32*) + store i32 7, i32* %select ret void } @@ -148,114 +148,114 @@ ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* inttoptr (i32 400 to i32 addrspace(3)*) ; CHECK-NEXT: store i32 7, i32 addrspace(3)* %select define amdgpu_kernel void @store_select_group_flat_inttoptr_group(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast 
i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i32 400 to i32 addrspace(3)*) to i32 addrspace(4)*) - store i32 7, i32 addrspace(4)* %select + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %select = select i1 %c, i32* %cast0, i32* addrspacecast (i32 addrspace(3)* inttoptr (i32 400 to i32 addrspace(3)*) to i32*) + store i32 7, i32* %select ret void } ; CHECK-LABEL: @store_select_group_global_mismatch_flat_constexpr( -; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* -; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*) -; CHECK: store i32 7, i32 addrspace(4)* %select +; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* +; CHECK: %select = select i1 %c, i32* %1, i32* addrspacecast (i32 addrspace(1)* @global0 to i32*) +; CHECK: store i32 7, i32* %select define amdgpu_kernel void @store_select_group_global_mismatch_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*) - store i32 7, i32 addrspace(4)* %select + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %select = select i1 %c, i32* %cast0, i32* addrspacecast (i32 addrspace(1)* @global0 to i32*) + store i32 7, i32* %select ret void } ; CHECK-LABEL: @store_select_group_global_mismatch_flat_constexpr_swap( -; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* -; CHECK: %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*), i32 addrspace(4)* %1 -; CHECK: store i32 7, i32 addrspace(4)* %select +; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to 
i32* +; CHECK: %select = select i1 %c, i32* addrspacecast (i32 addrspace(1)* @global0 to i32*), i32* %1 +; CHECK: store i32 7, i32* %select define amdgpu_kernel void @store_select_group_global_mismatch_flat_constexpr_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*), i32 addrspace(4)* %cast0 - store i32 7, i32 addrspace(4)* %select + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %select = select i1 %c, i32* addrspacecast (i32 addrspace(1)* @global0 to i32*), i32* %cast0 + store i32 7, i32* %select ret void } ; CHECK-LABEL: @store_select_group_global_mismatch_null_null( -; CHECK: %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*) -; CHECK: store i32 7, i32 addrspace(4)* %select +; CHECK: %select = select i1 %c, i32* addrspacecast (i32 addrspace(3)* null to i32*), i32* addrspacecast (i32 addrspace(1)* null to i32*) +; CHECK: store i32 7, i32* %select define amdgpu_kernel void @store_select_group_global_mismatch_null_null(i1 %c) #0 { - %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*) - store i32 7, i32 addrspace(4)* %select + %select = select i1 %c, i32* addrspacecast (i32 addrspace(3)* null to i32*), i32* addrspacecast (i32 addrspace(1)* null to i32*) + store i32 7, i32* %select ret void } ; CHECK-LABEL: @store_select_group_global_mismatch_null_null_constexpr( -; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to 
i32 addrspace(4)*)), align 4 +; CHECK: store i32 7, i32* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32* addrspacecast (i32 addrspace(3)* null to i32*), i32* addrspacecast (i32 addrspace(1)* null to i32*)), align 4 define amdgpu_kernel void @store_select_group_global_mismatch_null_null_constexpr() #0 { - store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 + store i32 7, i32* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32* addrspacecast (i32 addrspace(3)* null to i32*), i32* addrspacecast (i32 addrspace(1)* null to i32*)), align 4 ret void } ; CHECK-LABEL: @store_select_group_global_mismatch_gv_null_constexpr( -; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 +; CHECK: store i32 7, i32* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32* addrspacecast (i32 addrspace(3)* @lds0 to i32*), i32* addrspacecast (i32 addrspace(1)* null to i32*)), align 4 define amdgpu_kernel void @store_select_group_global_mismatch_gv_null_constexpr() #0 { - store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 + store i32 7, i32* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32* addrspacecast (i32 addrspace(3)* @lds0 to i32*), i32* addrspacecast (i32 addrspace(1)* null to i32*)), align 4 ret void } ; 
CHECK-LABEL: @store_select_group_global_mismatch_null_gv_constexpr( -; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)), align 4 +; CHECK: store i32 7, i32* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32* addrspacecast (i32 addrspace(3)* null to i32*), i32* addrspacecast (i32 addrspace(1)* @global0 to i32*)), align 4 define amdgpu_kernel void @store_select_group_global_mismatch_null_gv_constexpr() #0 { - store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)), align 4 + store i32 7, i32* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32* addrspacecast (i32 addrspace(3)* null to i32*), i32* addrspacecast (i32 addrspace(1)* @global0 to i32*)), align 4 ret void } ; CHECK-LABEL: @store_select_group_global_mismatch_inttoptr_null_constexpr( -; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i64 123 to i32 addrspace(3)*) to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 +; CHECK: store i32 7, i32* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32* addrspacecast (i32 addrspace(3)* inttoptr (i64 123 to i32 addrspace(3)*) to i32*), i32* addrspacecast (i32 addrspace(1)* null to i32*)), align 4 define amdgpu_kernel void @store_select_group_global_mismatch_inttoptr_null_constexpr() #0 { - store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 
to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i64 123 to i32 addrspace(3)*) to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 + store i32 7, i32* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32* addrspacecast (i32 addrspace(3)* inttoptr (i64 123 to i32 addrspace(3)*) to i32*), i32* addrspacecast (i32 addrspace(1)* null to i32*)), align 4 ret void } ; CHECK-LABEL: @store_select_group_global_mismatch_inttoptr_flat_null_constexpr( -; CHECK: store i32 7, i32 addrspace(1)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(1)* addrspacecast (i32 addrspace(4)* inttoptr (i64 123 to i32 addrspace(4)*) to i32 addrspace(1)*), i32 addrspace(1)* null), align 4 +; CHECK: store i32 7, i32 addrspace(1)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(1)* addrspacecast (i32* inttoptr (i64 123 to i32*) to i32 addrspace(1)*), i32 addrspace(1)* null), align 4 define amdgpu_kernel void @store_select_group_global_mismatch_inttoptr_flat_null_constexpr() #0 { - store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* inttoptr (i64 123 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 + store i32 7, i32* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32* inttoptr (i64 123 to i32*), i32* addrspacecast (i32 addrspace(1)* null to i32*)), align 4 ret void } ; CHECK-LABEL: @store_select_group_global_mismatch_undef_undef_constexpr( ; CHECK: store i32 7, i32 addrspace(3)* null define amdgpu_kernel void @store_select_group_global_mismatch_undef_undef_constexpr() #0 { - store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 
addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* undef to i32 addrspace(4)*)), align 4 + store i32 7, i32* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32* addrspacecast (i32 addrspace(3)* null to i32*), i32* addrspacecast (i32 addrspace(1)* undef to i32*)), align 4 ret void } @lds2 = external addrspace(3) global [1024 x i32], align 4 ; CHECK-LABEL: @store_select_group_constexpr_ptrtoint( -; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* -; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* inttoptr (i32 add (i32 ptrtoint ([1024 x i32] addrspace(3)* @lds2 to i32), i32 124) to i32 addrspace(1)*) to i32 addrspace(4)*) -; CHECK: store i32 7, i32 addrspace(4)* %select +; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* +; CHECK: %select = select i1 %c, i32* %1, i32* addrspacecast (i32 addrspace(1)* inttoptr (i32 add (i32 ptrtoint ([1024 x i32] addrspace(3)* @lds2 to i32), i32 124) to i32 addrspace(1)*) to i32*) +; CHECK: store i32 7, i32* %select define amdgpu_kernel void @store_select_group_constexpr_ptrtoint(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { - %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* - %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* inttoptr (i32 add (i32 ptrtoint ([1024 x i32] addrspace(3)* @lds2 to i32), i32 124) to i32 addrspace(1)*) to i32 addrspace(4)*) - store i32 7, i32 addrspace(4)* %select + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32* + %select = select i1 %c, i32* %cast0, i32* addrspacecast (i32 addrspace(1)* inttoptr (i32 add (i32 ptrtoint ([1024 x i32] addrspace(3)* @lds2 to i32), i32 124) to i32 addrspace(1)*) to i32*) + store i32 7, i32* %select ret void } ; CHECK-LABEL: @store_select_group_flat_vector( -; CHECK: %cast0 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.0 to <2 x i32 
addrspace(4)*> -; CHECK: %cast1 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.1 to <2 x i32 addrspace(4)*> -; CHECK: %select = select i1 %c, <2 x i32 addrspace(4)*> %cast0, <2 x i32 addrspace(4)*> %cast1 -; CHECK: %extract0 = extractelement <2 x i32 addrspace(4)*> %select, i32 0 -; CHECK: %extract1 = extractelement <2 x i32 addrspace(4)*> %select, i32 1 -; CHECK: store i32 -1, i32 addrspace(4)* %extract0 -; CHECK: store i32 -2, i32 addrspace(4)* %extract1 +; CHECK: %cast0 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.0 to <2 x i32*> +; CHECK: %cast1 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.1 to <2 x i32*> +; CHECK: %select = select i1 %c, <2 x i32*> %cast0, <2 x i32*> %cast1 +; CHECK: %extract0 = extractelement <2 x i32*> %select, i32 0 +; CHECK: %extract1 = extractelement <2 x i32*> %select, i32 1 +; CHECK: store i32 -1, i32* %extract0 +; CHECK: store i32 -2, i32* %extract1 define amdgpu_kernel void @store_select_group_flat_vector(i1 %c, <2 x i32 addrspace(3)*> %group.ptr.0, <2 x i32 addrspace(3)*> %group.ptr.1) #0 { - %cast0 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.0 to <2 x i32 addrspace(4)*> - %cast1 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.1 to <2 x i32 addrspace(4)*> - %select = select i1 %c, <2 x i32 addrspace(4)*> %cast0, <2 x i32 addrspace(4)*> %cast1 - %extract0 = extractelement <2 x i32 addrspace(4)*> %select, i32 0 - %extract1 = extractelement <2 x i32 addrspace(4)*> %select, i32 1 - store i32 -1, i32 addrspace(4)* %extract0 - store i32 -2, i32 addrspace(4)* %extract1 + %cast0 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.0 to <2 x i32*> + %cast1 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.1 to <2 x i32*> + %select = select i1 %c, <2 x i32*> %cast0, <2 x i32*> %cast1 + %extract0 = extractelement <2 x i32*> %select, i32 0 + %extract1 = extractelement <2 x i32*> %select, i32 1 + store i32 -1, i32* %extract0 + store i32 -2, i32* %extract1 ret void } Index: 
test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll =================================================================== --- test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll +++ test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll @@ -3,138 +3,138 @@ ; Check that volatile users of addrspacecast are not replaced. ; CHECK-LABEL: @volatile_load_flat_from_global( -; CHECK: load volatile i32, i32 addrspace(4)* +; CHECK: load volatile i32, i32* ; CHECK: store i32 %val, i32 addrspace(1)* define amdgpu_kernel void @volatile_load_flat_from_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { - %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)* - %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)* - %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4 - store i32 %val, i32 addrspace(4)* %tmp1, align 4 + %tmp0 = addrspacecast i32 addrspace(1)* %input to i32* + %tmp1 = addrspacecast i32 addrspace(1)* %output to i32* + %val = load volatile i32, i32* %tmp0, align 4 + store i32 %val, i32* %tmp1, align 4 ret void } ; CHECK-LABEL: @volatile_load_flat_from_constant( -; CHECK: load volatile i32, i32 addrspace(4)* +; CHECK: load volatile i32, i32* ; CHECK: store i32 %val, i32 addrspace(1)* define amdgpu_kernel void @volatile_load_flat_from_constant(i32 addrspace(2)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { - %tmp0 = addrspacecast i32 addrspace(2)* %input to i32 addrspace(4)* - %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)* - %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4 - store i32 %val, i32 addrspace(4)* %tmp1, align 4 + %tmp0 = addrspacecast i32 addrspace(2)* %input to i32* + %tmp1 = addrspacecast i32 addrspace(1)* %output to i32* + %val = load volatile i32, i32* %tmp0, align 4 + store i32 %val, i32* %tmp1, align 4 ret void } ; CHECK-LABEL: @volatile_load_flat_from_group( -; CHECK: load volatile i32, i32 addrspace(4)* +; CHECK: load volatile i32, i32* 
; CHECK: store i32 %val, i32 addrspace(3)* define amdgpu_kernel void @volatile_load_flat_from_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 { - %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)* - %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)* - %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4 - store i32 %val, i32 addrspace(4)* %tmp1, align 4 + %tmp0 = addrspacecast i32 addrspace(3)* %input to i32* + %tmp1 = addrspacecast i32 addrspace(3)* %output to i32* + %val = load volatile i32, i32* %tmp0, align 4 + store i32 %val, i32* %tmp1, align 4 ret void } ; CHECK-LABEL: @volatile_load_flat_from_private( -; CHECK: load volatile i32, i32 addrspace(4)* -; CHECK: store i32 %val, i32* -define amdgpu_kernel void @volatile_load_flat_from_private(i32* nocapture %input, i32* nocapture %output) #0 { - %tmp0 = addrspacecast i32* %input to i32 addrspace(4)* - %tmp1 = addrspacecast i32* %output to i32 addrspace(4)* - %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4 - store i32 %val, i32 addrspace(4)* %tmp1, align 4 +; CHECK: load volatile i32, i32* +; CHECK: store i32 %val, i32 addrspace(5)* +define amdgpu_kernel void @volatile_load_flat_from_private(i32 addrspace(5)* nocapture %input, i32 addrspace(5)* nocapture %output) #0 { + %tmp0 = addrspacecast i32 addrspace(5)* %input to i32* + %tmp1 = addrspacecast i32 addrspace(5)* %output to i32* + %val = load volatile i32, i32* %tmp0, align 4 + store i32 %val, i32* %tmp1, align 4 ret void } ; CHECK-LABEL: @volatile_store_flat_to_global( ; CHECK: load i32, i32 addrspace(1)* -; CHECK: store volatile i32 %val, i32 addrspace(4)* +; CHECK: store volatile i32 %val, i32* define amdgpu_kernel void @volatile_store_flat_to_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { - %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)* - %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)* - %val = 
load i32, i32 addrspace(4)* %tmp0, align 4 - store volatile i32 %val, i32 addrspace(4)* %tmp1, align 4 + %tmp0 = addrspacecast i32 addrspace(1)* %input to i32* + %tmp1 = addrspacecast i32 addrspace(1)* %output to i32* + %val = load i32, i32* %tmp0, align 4 + store volatile i32 %val, i32* %tmp1, align 4 ret void } ; CHECK-LABEL: @volatile_store_flat_to_group( ; CHECK: load i32, i32 addrspace(3)* -; CHECK: store volatile i32 %val, i32 addrspace(4)* +; CHECK: store volatile i32 %val, i32* define amdgpu_kernel void @volatile_store_flat_to_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 { - %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)* - %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)* - %val = load i32, i32 addrspace(4)* %tmp0, align 4 - store volatile i32 %val, i32 addrspace(4)* %tmp1, align 4 + %tmp0 = addrspacecast i32 addrspace(3)* %input to i32* + %tmp1 = addrspacecast i32 addrspace(3)* %output to i32* + %val = load i32, i32* %tmp0, align 4 + store volatile i32 %val, i32* %tmp1, align 4 ret void } ; CHECK-LABEL: @volatile_store_flat_to_private( -; CHECK: load i32, i32* -; CHECK: store volatile i32 %val, i32 addrspace(4)* -define amdgpu_kernel void @volatile_store_flat_to_private(i32* nocapture %input, i32* nocapture %output) #0 { - %tmp0 = addrspacecast i32* %input to i32 addrspace(4)* - %tmp1 = addrspacecast i32* %output to i32 addrspace(4)* - %val = load i32, i32 addrspace(4)* %tmp0, align 4 - store volatile i32 %val, i32 addrspace(4)* %tmp1, align 4 +; CHECK: load i32, i32 addrspace(5)* +; CHECK: store volatile i32 %val, i32* +define amdgpu_kernel void @volatile_store_flat_to_private(i32 addrspace(5)* nocapture %input, i32 addrspace(5)* nocapture %output) #0 { + %tmp0 = addrspacecast i32 addrspace(5)* %input to i32* + %tmp1 = addrspacecast i32 addrspace(5)* %output to i32* + %val = load i32, i32* %tmp0, align 4 + store volatile i32 %val, i32* %tmp1, align 4 ret void } ; CHECK-LABEL: 
@volatile_atomicrmw_add_group_to_flat( -; CHECK: addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)* -; CHECK: atomicrmw volatile add i32 addrspace(4)* +; CHECK: addrspacecast i32 addrspace(3)* %group.ptr to i32* +; CHECK: atomicrmw volatile add i32* define i32 @volatile_atomicrmw_add_group_to_flat(i32 addrspace(3)* %group.ptr, i32 %y) #0 { - %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)* - %ret = atomicrmw volatile add i32 addrspace(4)* %cast, i32 %y seq_cst + %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32* + %ret = atomicrmw volatile add i32* %cast, i32 %y seq_cst ret i32 %ret } ; CHECK-LABEL: @volatile_atomicrmw_add_global_to_flat( -; CHECK: addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)* -; CHECK: %ret = atomicrmw volatile add i32 addrspace(4)* +; CHECK: addrspacecast i32 addrspace(1)* %global.ptr to i32* +; CHECK: %ret = atomicrmw volatile add i32* define i32 @volatile_atomicrmw_add_global_to_flat(i32 addrspace(1)* %global.ptr, i32 %y) #0 { - %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)* - %ret = atomicrmw volatile add i32 addrspace(4)* %cast, i32 %y seq_cst + %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32* + %ret = atomicrmw volatile add i32* %cast, i32 %y seq_cst ret i32 %ret } ; CHECK-LABEL: @volatile_cmpxchg_global_to_flat( -; CHECK: addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)* -; CHECK: cmpxchg volatile i32 addrspace(4)* +; CHECK: addrspacecast i32 addrspace(1)* %global.ptr to i32* +; CHECK: cmpxchg volatile i32* define { i32, i1 } @volatile_cmpxchg_global_to_flat(i32 addrspace(1)* %global.ptr, i32 %cmp, i32 %val) #0 { - %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)* - %ret = cmpxchg volatile i32 addrspace(4)* %cast, i32 %cmp, i32 %val seq_cst monotonic + %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32* + %ret = cmpxchg volatile i32* %cast, i32 %cmp, i32 %val seq_cst monotonic ret { i32, i1 } %ret 
} ; CHECK-LABEL: @volatile_cmpxchg_group_to_flat( -; CHECK: addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)* -; CHECK: cmpxchg volatile i32 addrspace(4)* +; CHECK: addrspacecast i32 addrspace(3)* %group.ptr to i32* +; CHECK: cmpxchg volatile i32* define { i32, i1 } @volatile_cmpxchg_group_to_flat(i32 addrspace(3)* %group.ptr, i32 %cmp, i32 %val) #0 { - %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)* - %ret = cmpxchg volatile i32 addrspace(4)* %cast, i32 %cmp, i32 %val seq_cst monotonic + %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32* + %ret = cmpxchg volatile i32* %cast, i32 %cmp, i32 %val seq_cst monotonic ret { i32, i1 } %ret } ; FIXME: Shouldn't be losing names ; CHECK-LABEL: @volatile_memset_group_to_flat( -; CHECK: addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* -; CHECK: call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %1, i8 4, i64 32, i32 4, i1 true) +; CHECK: addrspacecast i8 addrspace(3)* %group.ptr to i8* +; CHECK: call void @llvm.memset.p0i8.i64(i8* %1, i8 4, i64 32, i32 4, i1 true) define amdgpu_kernel void @volatile_memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 { - %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* - call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 true) + %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8* + call void @llvm.memset.p0i8.i64(i8* %cast, i8 4, i64 32, i32 4, i1 true) ret void } ; CHECK-LABEL: @volatile_memset_global_to_flat( -; CHECK: addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)* -; CHECK: call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %1, i8 4, i64 32, i32 4, i1 true) +; CHECK: addrspacecast i8 addrspace(1)* %global.ptr to i8* +; CHECK: call void @llvm.memset.p0i8.i64(i8* %1, i8 4, i64 32, i32 4, i1 true) define amdgpu_kernel void @volatile_memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 { - %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 
addrspace(4)* - call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 true) + %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8* + call void @llvm.memset.p0i8.i64(i8* %cast, i8 4, i64 32, i32 4, i1 true) ret void } -declare void @llvm.memset.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8, i64, i32, i1) #1 +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #1 attributes #0 = { nounwind } attributes #1 = { argmemonly nounwind } Index: test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll @@ -1,38 +1,38 @@ ; RUN: opt -S -load-store-vectorizer -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s ; RUN: opt -S -load-store-vectorizer -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5" target triple = "amdgcn--" ; ALL-LABEL: @load_unknown_offset_align1_i8( ; ALL: alloca [128 x i8], align 1 -; UNALIGNED: load <2 x i8>, <2 x i8>* %{{[0-9]+}}, align 1{{$}} +; UNALIGNED: load <2 x i8>, <2 x i8> addrspace(5)* %{{[0-9]+}}, align 1{{$}} -; ALIGNED: load i8, i8* %ptr0, align 1{{$}} -; ALIGNED: load i8, i8* %ptr1, align 1{{$}} +; ALIGNED: load i8, i8 addrspace(5)* %ptr0, align 1{{$}} +; ALIGNED: load i8, i8 addrspace(5)* %ptr1, align 1{{$}} define amdgpu_kernel void 
@load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 { - %alloca = alloca [128 x i8], align 1 - %ptr0 = getelementptr inbounds [128 x i8], [128 x i8]* %alloca, i32 0, i32 %offset - %val0 = load i8, i8* %ptr0, align 1 - %ptr1 = getelementptr inbounds i8, i8* %ptr0, i32 1 - %val1 = load i8, i8* %ptr1, align 1 + %alloca = alloca [128 x i8], align 1, addrspace(5) + %ptr0 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %alloca, i32 0, i32 %offset + %val0 = load i8, i8 addrspace(5)* %ptr0, align 1 + %ptr1 = getelementptr inbounds i8, i8 addrspace(5)* %ptr0, i32 1 + %val1 = load i8, i8 addrspace(5)* %ptr1, align 1 %add = add i8 %val0, %val1 store i8 %add, i8 addrspace(1)* %out ret void } ; ALL-LABEL: @load_unknown_offset_align1_i16( -; ALL: alloca [128 x i16], align 1{{$}} -; UNALIGNED: load <2 x i16>, <2 x i16>* %{{[0-9]+}}, align 1{{$}} +; ALL: alloca [128 x i16], align 1, addrspace(5){{$}} +; UNALIGNED: load <2 x i16>, <2 x i16> addrspace(5)* %{{[0-9]+}}, align 1{{$}} -; ALIGNED: load i16, i16* %ptr0, align 1{{$}} -; ALIGNED: load i16, i16* %ptr1, align 1{{$}} +; ALIGNED: load i16, i16 addrspace(5)* %ptr0, align 1{{$}} +; ALIGNED: load i16, i16 addrspace(5)* %ptr1, align 1{{$}} define amdgpu_kernel void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 { - %alloca = alloca [128 x i16], align 1 - %ptr0 = getelementptr inbounds [128 x i16], [128 x i16]* %alloca, i32 0, i32 %offset - %val0 = load i16, i16* %ptr0, align 1 - %ptr1 = getelementptr inbounds i16, i16* %ptr0, i32 1 - %val1 = load i16, i16* %ptr1, align 1 + %alloca = alloca [128 x i16], align 1, addrspace(5) + %ptr0 = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* %alloca, i32 0, i32 %offset + %val0 = load i16, i16 addrspace(5)* %ptr0, align 1 + %ptr1 = getelementptr inbounds i16, i16 addrspace(5)* %ptr0, i32 1 + %val1 = load i16, i16 addrspace(5)* %ptr1, align 1 %add = add i16 %val0, %val1 store i16 %add, i16 addrspace(1)* %out 
ret void @@ -43,16 +43,16 @@ ; ALL-LABEL: @load_unknown_offset_align1_i32( ; ALL: alloca [128 x i32], align 1 -; UNALIGNED: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 1{{$}} +; UNALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}} -; ALIGNED: load i32, i32* %ptr0, align 1 -; ALIGNED: load i32, i32* %ptr1, align 1 +; ALIGNED: load i32, i32 addrspace(5)* %ptr0, align 1 +; ALIGNED: load i32, i32 addrspace(5)* %ptr1, align 1 define amdgpu_kernel void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { - %alloca = alloca [128 x i32], align 1 - %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset - %val0 = load i32, i32* %ptr0, align 1 - %ptr1 = getelementptr inbounds i32, i32* %ptr0, i32 1 - %val1 = load i32, i32* %ptr1, align 1 + %alloca = alloca [128 x i32], align 1, addrspace(5) + %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset + %val0 = load i32, i32 addrspace(5)* %ptr0, align 1 + %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1 + %val1 = load i32, i32 addrspace(5)* %ptr1, align 1 %add = add i32 %val0, %val1 store i32 %add, i32 addrspace(1)* %out ret void @@ -63,17 +63,17 @@ ; ALL-LABEL: @load_alloca16_unknown_offset_align1_i32( ; ALL: alloca [128 x i32], align 16 -; UNALIGNED: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 1{{$}} +; UNALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}} ; FIXME: Should change alignment ; ALIGNED: load i32 ; ALIGNED: load i32 define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { - %alloca = alloca [128 x i32], align 16 - %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset - %val0 = load i32, i32* %ptr0, align 1 - %ptr1 = getelementptr inbounds i32, i32* %ptr0, i32 1 - %val1 = load i32, i32* %ptr1, align 1 + %alloca = alloca [128 x i32], align 16, addrspace(5) + 
%ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset + %val0 = load i32, i32 addrspace(5)* %ptr0, align 1 + %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1 + %val1 = load i32, i32 addrspace(5)* %ptr1, align 1 %add = add i32 %val0, %val1 store i32 %add, i32 addrspace(1)* %out ret void @@ -81,31 +81,31 @@ ; ALL-LABEL: @store_unknown_offset_align1_i8( ; ALL: alloca [128 x i8], align 1 -; UNALIGNED: store <2 x i8> <i8 9, i8 10>, <2 x i8>* %{{[0-9]+}}, align 1{{$}} +; UNALIGNED: store <2 x i8> <i8 9, i8 10>, <2 x i8> addrspace(5)* %{{[0-9]+}}, align 1{{$}} -; ALIGNED: store i8 9, i8* %ptr0, align 1{{$}} -; ALIGNED: store i8 10, i8* %ptr1, align 1{{$}} +; ALIGNED: store i8 9, i8 addrspace(5)* %ptr0, align 1{{$}} +; ALIGNED: store i8 10, i8 addrspace(5)* %ptr1, align 1{{$}} define amdgpu_kernel void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 { - %alloca = alloca [128 x i8], align 1 - %ptr0 = getelementptr inbounds [128 x i8], [128 x i8]* %alloca, i32 0, i32 %offset - store i8 9, i8* %ptr0, align 1 - %ptr1 = getelementptr inbounds i8, i8* %ptr0, i32 1 - store i8 10, i8* %ptr1, align 1 + %alloca = alloca [128 x i8], align 1, addrspace(5) + %ptr0 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %alloca, i32 0, i32 %offset + store i8 9, i8 addrspace(5)* %ptr0, align 1 + %ptr1 = getelementptr inbounds i8, i8 addrspace(5)* %ptr0, i32 1 + store i8 10, i8 addrspace(5)* %ptr1, align 1 ret void } ; ALL-LABEL: @store_unknown_offset_align1_i16( ; ALL: alloca [128 x i16], align 1 -; UNALIGNED: store <2 x i16> <i16 9, i16 10>, <2 x i16>* %{{[0-9]+}}, align 1{{$}} +; UNALIGNED: store <2 x i16> <i16 9, i16 10>, <2 x i16> addrspace(5)* %{{[0-9]+}}, align 1{{$}} -; ALIGNED: store i16 9, i16* %ptr0, align 1{{$}} -; ALIGNED: store i16 10, i16* %ptr1, align 1{{$}} +; ALIGNED: store i16 9, i16 addrspace(5)* %ptr0, align 1{{$}} +; ALIGNED: store i16 10, i16 addrspace(5)* %ptr1, align 1{{$}} define amdgpu_kernel void 
@store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 { - %alloca = alloca [128 x i16], align 1 - %ptr0 = getelementptr inbounds [128 x i16], [128 x i16]* %alloca, i32 0, i32 %offset - store i16 9, i16* %ptr0, align 1 - %ptr1 = getelementptr inbounds i16, i16* %ptr0, i32 1 - store i16 10, i16* %ptr1, align 1 + %alloca = alloca [128 x i16], align 1, addrspace(5) + %ptr0 = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* %alloca, i32 0, i32 %offset + store i16 9, i16 addrspace(5)* %ptr0, align 1 + %ptr1 = getelementptr inbounds i16, i16 addrspace(5)* %ptr0, i32 1 + store i16 10, i16 addrspace(5)* %ptr1, align 1 ret void } @@ -115,16 +115,16 @@ ; ALL-LABEL: @store_unknown_offset_align1_i32( ; ALL: alloca [128 x i32], align 1 -; UNALIGNED: store <2 x i32> <i32 9, i32 10>, <2 x i32>* %{{[0-9]+}}, align 1{{$}} +; UNALIGNED: store <2 x i32> <i32 9, i32 10>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}} -; ALIGNED: store i32 9, i32* %ptr0, align 1 -; ALIGNED: store i32 10, i32* %ptr1, align 1 +; ALIGNED: store i32 9, i32 addrspace(5)* %ptr0, align 1 +; ALIGNED: store i32 10, i32 addrspace(5)* %ptr1, align 1 define amdgpu_kernel void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { - %alloca = alloca [128 x i32], align 1 - %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset - store i32 9, i32* %ptr0, align 1 - %ptr1 = getelementptr inbounds i32, i32* %ptr0, i32 1 - store i32 10, i32* %ptr1, align 1 + %alloca = alloca [128 x i32], align 1, addrspace(5) + %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset + store i32 9, i32 addrspace(5)* %ptr0, align 1 + %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1 + store i32 10, i32 addrspace(5)* %ptr1, align 1 ret void } Index: test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll =================================================================== --- 
test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll @@ -5,7 +5,6 @@ ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-UNALIGNED,UNALIGNED,ALL %s ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-UNALIGNED,UNALIGNED,ALL %s -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32 ; ALIGNED: store i32 @@ -17,52 +16,52 @@ ; ELT8-UNALIGNED: store <2 x i32> ; ELT16-UNALIGNED: store <4 x i32> -define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 { - %out.gep.1 = getelementptr i32, i32* %out, i32 1 - %out.gep.2 = getelementptr i32, i32* %out, i32 2 - %out.gep.3 = getelementptr i32, i32* %out, i32 3 - - store i32 9, i32* %out - store i32 1, i32* %out.gep.1 - store i32 23, i32* %out.gep.2 - store i32 19, i32* %out.gep.3 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3 + + store i32 9, i32 addrspace(5)* %out + store i32 1, i32 addrspace(5)* %out.gep.1 + store i32 23, i32 addrspace(5)* %out.gep.2 + store i32 19, i32 addrspace(5)* %out.gep.3 ret void } ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align1( -; ALIGNED: store i32 9, i32* %out, align 1 -; ALIGNED: store i32 1, i32* %out.gep.1, align 1 -; ALIGNED: store i32 23, i32* %out.gep.2, align 1 -; ALIGNED: store i32 19, i32* 
%out.gep.3, align 1 +; ALIGNED: store i32 9, i32 addrspace(5)* %out, align 1 +; ALIGNED: store i32 1, i32 addrspace(5)* %out.gep.1, align 1 +; ALIGNED: store i32 23, i32 addrspace(5)* %out.gep.2, align 1 +; ALIGNED: store i32 19, i32 addrspace(5)* %out.gep.3, align 1 -; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32>* %1, align 1 +; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 1 -; ELT8-UNALIGNED: store <2 x i32> <i32 9, i32 1>, <2 x i32>* %1, align 1 -; ELT8-UNALIGNED: store <2 x i32> <i32 23, i32 19>, <2 x i32>* %2, align 1 +; ELT8-UNALIGNED: store <2 x i32> <i32 9, i32 1>, <2 x i32> addrspace(5)* %1, align 1 +; ELT8-UNALIGNED: store <2 x i32> <i32 23, i32 19>, <2 x i32> addrspace(5)* %2, align 1 ; ELT4-UNALIGNED: store i32 ; ELT4-UNALIGNED: store i32 ; ELT4-UNALIGNED: store i32 ; ELT4-UNALIGNED: store i32 -define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32* %out) #0 { - %out.gep.1 = getelementptr i32, i32* %out, i32 1 - %out.gep.2 = getelementptr i32, i32* %out, i32 2 - %out.gep.3 = getelementptr i32, i32* %out, i32 3 - - store i32 9, i32* %out, align 1 - store i32 1, i32* %out.gep.1, align 1 - store i32 23, i32* %out.gep.2, align 1 - store i32 19, i32* %out.gep.3, align 1 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3 + + store i32 9, i32 addrspace(5)* %out, align 1 + store i32 1, i32 addrspace(5)* %out.gep.1, align 1 + store i32 23, i32 addrspace(5)* %out.gep.2, align 1 + store i32 19, i32 addrspace(5)* %out.gep.3, align 1 ret void } ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align2( -; ALIGNED: store i32 9, i32* %out, align 2 -; ALIGNED: store i32 1, i32* %out.gep.1, align 2 -; ALIGNED: store i32 23, i32* %out.gep.2, align 2 -; ALIGNED: store i32 19, i32* %out.gep.3, align 2 +; ALIGNED: store i32 
9, i32 addrspace(5)* %out, align 2 +; ALIGNED: store i32 1, i32 addrspace(5)* %out.gep.1, align 2 +; ALIGNED: store i32 23, i32 addrspace(5)* %out.gep.2, align 2 +; ALIGNED: store i32 19, i32 addrspace(5)* %out.gep.3, align 2 -; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32>* %1, align 2 +; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 2 ; ELT8-UNALIGNED: store <2 x i32> ; ELT8-UNALIGNED: store <2 x i32> @@ -71,29 +70,29 @@ ; ELT4-UNALIGNED: store i32 ; ELT4-UNALIGNED: store i32 ; ELT4-UNALIGNED: store i32 -define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32* %out) #0 { - %out.gep.1 = getelementptr i32, i32* %out, i32 1 - %out.gep.2 = getelementptr i32, i32* %out, i32 2 - %out.gep.3 = getelementptr i32, i32* %out, i32 3 - - store i32 9, i32* %out, align 2 - store i32 1, i32* %out.gep.1, align 2 - store i32 23, i32* %out.gep.2, align 2 - store i32 19, i32* %out.gep.3, align 2 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3 + + store i32 9, i32 addrspace(5)* %out, align 2 + store i32 1, i32 addrspace(5)* %out.gep.1, align 2 + store i32 23, i32 addrspace(5)* %out.gep.2, align 2 + store i32 19, i32 addrspace(5)* %out.gep.3, align 2 ret void } ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8( ; ALL: store <4 x i8> -define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 { - %out.gep.1 = getelementptr i8, i8* %out, i32 1 - %out.gep.2 = getelementptr i8, i8* %out, i32 2 - %out.gep.3 = getelementptr i8, i8* %out, i32 3 - - store i8 9, i8* %out, align 4 - store i8 1, i8* %out.gep.1 - store i8 23, i8* %out.gep.2 - store i8 19, i8* %out.gep.3 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8(i8 addrspace(5)* %out) 
#0 { + %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i32 1 + %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i32 2 + %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i32 3 + + store i8 9, i8 addrspace(5)* %out, align 4 + store i8 1, i8 addrspace(5)* %out.gep.1 + store i8 23, i8 addrspace(5)* %out.gep.2 + store i8 19, i8 addrspace(5)* %out.gep.3 ret void } @@ -103,26 +102,26 @@ ; ALIGNED: store i8 ; ALIGNED: store i8 -; UNALIGNED: store <4 x i8> , <4 x i8>* %1, align 1 -define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8* %out) #0 { - %out.gep.1 = getelementptr i8, i8* %out, i32 1 - %out.gep.2 = getelementptr i8, i8* %out, i32 2 - %out.gep.3 = getelementptr i8, i8* %out, i32 3 +; UNALIGNED: store <4 x i8> , <4 x i8> addrspace(5)* %1, align 1 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i32 1 + %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i32 2 + %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i32 3 - store i8 9, i8* %out, align 1 - store i8 1, i8* %out.gep.1, align 1 - store i8 23, i8* %out.gep.2, align 1 - store i8 19, i8* %out.gep.3, align 1 + store i8 9, i8 addrspace(5)* %out, align 1 + store i8 1, i8 addrspace(5)* %out.gep.1, align 1 + store i8 23, i8 addrspace(5)* %out.gep.2, align 1 + store i8 19, i8 addrspace(5)* %out.gep.3, align 1 ret void } ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16( ; ALL: store <2 x i16> -define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 { - %out.gep.1 = getelementptr i16, i16* %out, i32 1 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16(i16 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1 - store i16 9, i16* %out, align 4 - store i16 12, i16* %out.gep.1 + store i16 9, i16 addrspace(5)* %out, align 4 + store i16 12, i16 addrspace(5)* 
%out.gep.1 ret void } @@ -130,12 +129,12 @@ ; ALIGNED: store i16 ; ALIGNED: store i16 -; UNALIGNED: store <2 x i16> , <2 x i16>* %1, align 2 -define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16* %out) #0 { - %out.gep.1 = getelementptr i16, i16* %out, i32 1 +; UNALIGNED: store <2 x i16> , <2 x i16> addrspace(5)* %1, align 2 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1 - store i16 9, i16* %out, align 2 - store i16 12, i16* %out.gep.1, align 2 + store i16 9, i16 addrspace(5)* %out, align 2 + store i16 12, i16 addrspace(5)* %out.gep.1, align 2 ret void } @@ -143,22 +142,22 @@ ; ALIGNED: store i16 ; ALIGNED: store i16 -; UNALIGNED: store <2 x i16> , <2 x i16>* %1, align 1 -define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16* %out) #0 { - %out.gep.1 = getelementptr i16, i16* %out, i32 1 +; UNALIGNED: store <2 x i16> , <2 x i16> addrspace(5)* %1, align 1 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1 - store i16 9, i16* %out, align 1 - store i16 12, i16* %out.gep.1, align 1 + store i16 9, i16 addrspace(5)* %out, align 1 + store i16 12, i16 addrspace(5)* %out.gep.1, align 1 ret void } ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align8( -; ALL: store <2 x i16> , <2 x i16>* %1, align 8 -define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16* %out) #0 { - %out.gep.1 = getelementptr i16, i16* %out, i32 1 +; ALL: store <2 x i16> , <2 x i16> addrspace(5)* %1, align 8 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1 - store i16 9, i16* %out, align 8 - store i16 12, i16* %out.gep.1, align 2 + store 
i16 9, i16 addrspace(5)* %out, align 8 + store i16 12, i16 addrspace(5)* %out.gep.1, align 2 ret void } @@ -179,13 +178,13 @@ ; ELT16-ALIGNED: store i32 ; ELT16-UNALIGNED: store <3 x i32> -define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32* %out) #0 { - %out.gep.1 = getelementptr i32, i32* %out, i32 1 - %out.gep.2 = getelementptr i32, i32* %out, i32 2 +define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2 - store i32 9, i32* %out - store i32 1, i32* %out.gep.1 - store i32 23, i32* %out.gep.2 + store i32 9, i32 addrspace(5)* %out + store i32 1, i32 addrspace(5)* %out.gep.1 + store i32 23, i32 addrspace(5)* %out.gep.2 ret void } @@ -202,13 +201,13 @@ ; ELT8-UNALIGNED: store i32 ; ELT16-UNALIGNED: store <3 x i32> -define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32* %out) #0 { - %out.gep.1 = getelementptr i32, i32* %out, i32 1 - %out.gep.2 = getelementptr i32, i32* %out, i32 2 +define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2 - store i32 9, i32* %out, align 1 - store i32 1, i32* %out.gep.1, align 1 - store i32 23, i32* %out.gep.2, align 1 + store i32 9, i32 addrspace(5)* %out, align 1 + store i32 1, i32 addrspace(5)* %out.gep.1, align 1 + store i32 23, i32 addrspace(5)* %out.gep.2, align 1 ret void } @@ -218,13 +217,13 @@ ; ALIGNED: store i8 ; UNALIGNED: store <3 x i8> -define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8* %out) #0 { - %out.gep.1 = getelementptr i8, i8* %out, i8 1 - %out.gep.2 = getelementptr i8, i8* %out, i8 2 +define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8 addrspace(5)* 
%out) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1 + %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2 - store i8 9, i8* %out, align 1 - store i8 1, i8* %out.gep.1, align 1 - store i8 23, i8* %out.gep.2, align 1 + store i8 9, i8 addrspace(5)* %out, align 1 + store i8 1, i8 addrspace(5)* %out.gep.1, align 1 + store i8 23, i8 addrspace(5)* %out.gep.2, align 1 ret void } Index: test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll =================================================================== --- test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll +++ test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll @@ -1,6 +1,5 @@ ; RUN: llc < %s | FileCheck %s -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" target triple = "amdgcn--" ; We need to compile this for a target where we have different address spaces, @@ -21,9 +20,9 @@ loop: %idx0 = phi i32 [ %next_idx0, %loop ], [ 0, %entry ] - %0 = getelementptr inbounds i32, i32* null, i32 %idx0 + %0 = getelementptr inbounds i32, i32 addrspace(5)* null, i32 %idx0 %1 = getelementptr inbounds i32, i32 addrspace(1)* null, i32 %idx0 - store i32 1, i32* %0 + store i32 1, i32 addrspace(5)* %0 store i32 7, i32 addrspace(1)* %1 %next_idx0 = add nuw nsw i32 %idx0, 1 br label %loop Index: test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll =================================================================== --- test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll +++ test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll @@ -1,4 +1,5 @@ ; RUN: opt -mtriple=amdgcn-unknown-amdhsa -loop-unroll -S -amdgpu-unroll-threshold-private=20000 %s | FileCheck %s +target datalayout = "A5" ; Check that we full unroll loop to be able to eliminate alloca ; CHECK-LABEL: @non_invariant_ind @@ -9,13 +10,13 @@ define amdgpu_kernel void 
@non_invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) { entry: - %arr = alloca [64 x i32], align 4 + %arr = alloca [64 x i32], align 4, addrspace(5) %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1 br label %for.body for.cond.cleanup: ; preds = %for.body - %arrayidx5 = getelementptr inbounds [64 x i32], [64 x i32]* %arr, i32 0, i32 %x - %tmp15 = load i32, i32* %arrayidx5, align 4 + %arrayidx5 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %arr, i32 0, i32 %x + %tmp15 = load i32, i32 addrspace(5)* %arrayidx5, align 4 %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %tmp1 store i32 %tmp15, i32 addrspace(1)* %arrayidx7, align 4 ret void @@ -27,8 +28,8 @@ %tmp16 = load i32, i32 addrspace(1)* %arrayidx, align 4 %add = add nsw i32 %i.015, %tmp1 %rem = srem i32 %add, 64 - %arrayidx3 = getelementptr inbounds [64 x i32], [64 x i32]* %arr, i32 0, i32 %rem - store i32 %tmp16, i32* %arrayidx3, align 4 + %arrayidx3 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %arr, i32 0, i32 %rem + store i32 %tmp16, i32 addrspace(5)* %arrayidx3, align 4 %inc = add nuw nsw i32 %i.015, 1 %exitcond = icmp eq i32 %inc, 100 br i1 %exitcond, label %for.cond.cleanup, label %for.body @@ -42,7 +43,7 @@ define amdgpu_kernel void @invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) { entry: - %arr = alloca [64 x i32], align 4 + %arr = alloca [64 x i32], align 4, addrspace(5) %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1 br label %for.cond2.preheader @@ -54,8 +55,8 @@ br label %for.body6 for.cond.cleanup: ; preds = %for.cond.cleanup5 - %arrayidx13 = getelementptr inbounds [64 x i32], [64 x i32]* %arr, i32 0, i32 %x - %tmp16 = load i32, i32* %arrayidx13, align 4 + %arrayidx13 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %arr, i32 0, i32 %x + %tmp16 = load i32, i32 addrspace(5)* %arrayidx13, align 4 %arrayidx15 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %tmp1 store i32 %tmp16, i32 addrspace(1)* %arrayidx15, 
align 4 ret void @@ -69,8 +70,8 @@ %j.025 = phi i32 [ 0, %for.cond2.preheader ], [ %inc, %for.body6 ] %add = add nsw i32 %j.025, %tmp1 %rem = srem i32 %add, 64 - %arrayidx8 = getelementptr inbounds [64 x i32], [64 x i32]* %arr, i32 0, i32 %rem - store i32 %tmp15, i32* %arrayidx8, align 4 + %arrayidx8 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %arr, i32 0, i32 %rem + store i32 %tmp15, i32 addrspace(5)* %arrayidx8, align 4 %inc = add nuw nsw i32 %j.025, 1 %exitcond = icmp eq i32 %inc, 100 br i1 %exitcond, label %for.cond.cleanup5, label %for.body6 @@ -84,13 +85,13 @@ define amdgpu_kernel void @too_big(i32 addrspace(1)* nocapture %a, i32 %x) { entry: - %arr = alloca [256 x i32], align 4 + %arr = alloca [256 x i32], align 4, addrspace(5) %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1 br label %for.body for.cond.cleanup: ; preds = %for.body - %arrayidx5 = getelementptr inbounds [256 x i32], [256 x i32]* %arr, i32 0, i32 %x - %tmp15 = load i32, i32* %arrayidx5, align 4 + %arrayidx5 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(5)* %arr, i32 0, i32 %x + %tmp15 = load i32, i32 addrspace(5)* %arrayidx5, align 4 %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %tmp1 store i32 %tmp15, i32 addrspace(1)* %arrayidx7, align 4 ret void @@ -102,8 +103,8 @@ %tmp16 = load i32, i32 addrspace(1)* %arrayidx, align 4 %add = add nsw i32 %i.015, %tmp1 %rem = srem i32 %add, 64 - %arrayidx3 = getelementptr inbounds [256 x i32], [256 x i32]* %arr, i32 0, i32 %rem - store i32 %tmp16, i32* %arrayidx3, align 4 + %arrayidx3 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(5)* %arr, i32 0, i32 %rem + store i32 %tmp16, i32 addrspace(5)* %arrayidx3, align 4 %inc = add nuw nsw i32 %i.015, 1 %exitcond = icmp eq i32 %inc, 100 br i1 %exitcond, label %for.cond.cleanup, label %for.body @@ -118,13 +119,13 @@ define amdgpu_kernel void @dynamic_size_alloca(i32 addrspace(1)* nocapture %a, i32 %n, i32 %x) { entry: - %arr = alloca i32, i32 %n, 
align 4 + %arr = alloca i32, i32 %n, align 4, addrspace(5) %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1 br label %for.body for.cond.cleanup: ; preds = %for.body - %arrayidx5 = getelementptr inbounds i32, i32* %arr, i32 %x - %tmp15 = load i32, i32* %arrayidx5, align 4 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(5)* %arr, i32 %x + %tmp15 = load i32, i32 addrspace(5)* %arrayidx5, align 4 %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %tmp1 store i32 %tmp15, i32 addrspace(1)* %arrayidx7, align 4 ret void @@ -136,8 +137,8 @@ %tmp16 = load i32, i32 addrspace(1)* %arrayidx, align 4 %add = add nsw i32 %i.015, %tmp1 %rem = srem i32 %add, 64 - %arrayidx3 = getelementptr inbounds i32, i32* %arr, i32 %rem - store i32 %tmp16, i32* %arrayidx3, align 4 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(5)* %arr, i32 %rem + store i32 %tmp16, i32 addrspace(5)* %arrayidx3, align 4 %inc = add nuw nsw i32 %i.015, 1 %exitcond = icmp eq i32 %inc, 100 br i1 %exitcond, label %for.cond.cleanup, label %for.body