Index: docs/AMDGPUUsage.rst
===================================================================
--- docs/AMDGPUUsage.rst
+++ docs/AMDGPUUsage.rst
@@ -270,26 +270,16 @@
   .. table:: Address Space Mapping
      :name: amdgpu-address-space-mapping-table
 
-     ================== ================= =================
+     ================== =================
      LLVM Address Space Memory Space
-     ------------------ -----------------------------------
-     \                  Current Default   Future Default
-     ================== ================= =================
-     0                  Generic (Flat)    Generic (Flat)
-     1                  Global            Global
-     2                  Constant          Region (GDS)
-     3                  Local (group/LDS) Local (group/LDS)
-     4                  Region (GDS)      Constant
-     5                  Private (Scratch) Private (Scratch)
-     ================== ================= =================
-
-Current Default
-  This is the current default address space mapping used for all languages.
-  This will shortly be deprecated.
-
-Future Default
-  This will shortly be the only address space mapping for all languages using
-  AMDGPU backend.
+     ================== =================
+     0                  Generic (Flat)
+     1                  Global
+     2                  Region (GDS)
+     3                  Local (group/LDS)
+     4                  Constant
+     5                  Private (Scratch)
+     ================== =================
 
 .. _amdgpu-memory-scopes:
Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -83,22 +83,22 @@
 
 def int_amdgcn_dispatch_ptr :
   GCCBuiltin<"__builtin_amdgcn_dispatch_ptr">,
-  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
   [IntrNoMem, IntrSpeculatable]>;
 
 def int_amdgcn_queue_ptr :
   GCCBuiltin<"__builtin_amdgcn_queue_ptr">,
-  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
   [IntrNoMem, IntrSpeculatable]>;
 
 def int_amdgcn_kernarg_segment_ptr :
   GCCBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
-  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
   [IntrNoMem, IntrSpeculatable]>;
 
 def int_amdgcn_implicitarg_ptr :
   GCCBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
-  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
   [IntrNoMem, IntrSpeculatable]>;
 
 def int_amdgcn_groupstaticsize :
@@ -111,7 +111,7 @@
 
 def int_amdgcn_implicit_buffer_ptr :
   GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
-  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
   [IntrNoMem, IntrSpeculatable]>;
 
 // Set EXEC to the 64-bit value given.
Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -222,7 +222,7 @@
   MAX_COMMON_ADDRESS = 5,
 
   GLOBAL_ADDRESS = 1,   ///< Address space for global memory (RAT0, VTX0).
-  CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2)
+  CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2)
   LOCAL_ADDRESS = 3,    ///< Address space for local memory.
   /// Address space for direct addressible parameter memory (CONST0)
   PARAM_D_ADDRESS = 6,
Index: lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -61,7 +61,7 @@
 /* Region   */ {NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, MayAlias}
 };
 static const AliasResult ASAliasRulesGenIsZero[6][6] = {
-/*             Flat       Global    Constant  Group     Region    Private */
+/*             Flat       Global    Region    Group     Constant  Private */
 /* Flat     */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
 /* Global   */ {MayAlias, MayAlias, NoAlias , NoAlias , NoAlias , NoAlias},
 /* Constant */ {MayAlias, NoAlias , MayAlias, NoAlias , NoAlias,  NoAlias},
@@ -72,9 +72,9 @@
   assert(AS.MAX_COMMON_ADDRESS <= 5);
   if (AS.FLAT_ADDRESS == 0) {
     assert(AS.GLOBAL_ADDRESS   == 1 &&
-           AS.REGION_ADDRESS   == 4 &&
+           AS.REGION_ADDRESS   == 2 &&
            AS.LOCAL_ADDRESS    == 3 &&
-           AS.CONSTANT_ADDRESS == 2 &&
+           AS.CONSTANT_ADDRESS == 4 &&
            AS.PRIVATE_ADDRESS  == 5);
     ASAliasRules = &ASAliasRulesGenIsZero;
   } else {
Index: lib/Target/AMDGPU/AMDGPUCallLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -116,7 +116,7 @@
 
   if (Info->hasKernargSegmentPtr()) {
     unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
-    const LLT P2 = LLT::pointer(2, 64);
+    const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
     unsigned VReg = MRI.createGenericVirtualRegister(P2);
     MRI.addLiveIn(InputPtrReg, VReg);
     MIRBuilder.getMBB().addLiveIn(InputPtrReg);
Index: lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -12,6 +12,7 @@
 /// \todo This should be generated by TableGen.
 //===----------------------------------------------------------------------===//
 
+#include "AMDGPU.h"
 #include "AMDGPULegalizerInfo.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/ValueTypes.h"
@@ -29,8 +30,8 @@
   const LLT V2S16 = LLT::vector(2, 16);
   const LLT S32 = LLT::scalar(32);
   const LLT S64 = LLT::scalar(64);
-  const LLT P1 = LLT::pointer(1, 64);
-  const LLT P2 = LLT::pointer(2, 64);
+  const LLT P1 = LLT::pointer(AMDGPUAS::GLOBAL_ADDRESS, 64);
+  const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
 
   setAction({G_ADD, S32}, Legal);
   setAction({G_AND, S32}, Legal);
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -266,7 +266,7 @@
 
   // 32-bit private, local, and region pointers. 64-bit global, constant and
   // flat.
- return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32" + return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32" "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"; } Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -946,7 +946,7 @@ AMDGPUAS AS; AS.FLAT_ADDRESS = 0; AS.PRIVATE_ADDRESS = 5; - AS.REGION_ADDRESS = 4; + AS.REGION_ADDRESS = 2; return AS; } Index: test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir =================================================================== --- test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir +++ test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir @@ -5,7 +5,7 @@ # REQUIRES: global-isel --- | - define amdgpu_kernel void @smrd_imm(i32 addrspace(2)* %const0) { ret void } + define amdgpu_kernel void @smrd_imm(i32 addrspace(4)* %const0) { ret void } ... --- @@ -91,50 +91,50 @@ bb.0: liveins: $sgpr0_sgpr1 - %0:sgpr(p2) = COPY $sgpr0_sgpr1 + %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 4 - %2:sgpr(p2) = G_GEP %0, %1 + %2:sgpr(p4) = G_GEP %0, %1 %3:sgpr(s32) = G_LOAD %2 :: (load 4 from %ir.const0) $sgpr0 = COPY %3 %4:sgpr(s64) = G_CONSTANT i64 1020 - %5:sgpr(p2) = G_GEP %0, %4 + %5:sgpr(p4) = G_GEP %0, %4 %6:sgpr(s32) = G_LOAD %5 :: (load 4 from %ir.const0) $sgpr0 = COPY %6 %7:sgpr(s64) = G_CONSTANT i64 1024 - %8:sgpr(p2) = G_GEP %0, %7 + %8:sgpr(p4) = G_GEP %0, %7 %9:sgpr(s32) = G_LOAD %8 :: (load 4 from %ir.const0) $sgpr0 = COPY %9 %10:sgpr(s64) = G_CONSTANT i64 1048572 - %11:sgpr(p2) = G_GEP %0, %10 + %11:sgpr(p4) = G_GEP %0, %10 %12:sgpr(s32) = G_LOAD %11 :: (load 4 from %ir.const0) $sgpr0 = COPY %12 %13:sgpr(s64) = G_CONSTANT i64 1048576 - %14:sgpr(p2) = G_GEP %0, %13 + %14:sgpr(p4) = G_GEP %0, %13 %15:sgpr(s32) = G_LOAD %14 :: (load 4 from %ir.const0) $sgpr0 = COPY %15 %16:sgpr(s64) = G_CONSTANT i64 17179869180 - %17:sgpr(p2) = G_GEP %0, %16 + %17:sgpr(p4) = G_GEP %0, %16 %18:sgpr(s32) = G_LOAD %17 :: (load 4 from %ir.const0) $sgpr0 = COPY %18 %19:sgpr(s64) = G_CONSTANT i64 17179869184 - %20:sgpr(p2) = G_GEP %0, %19 + %20:sgpr(p4) = G_GEP %0, %19 %21:sgpr(s32) = G_LOAD %20 :: (load 4 from %ir.const0) $sgpr0 = COPY %21 %22:sgpr(s64) = G_CONSTANT i64 4294967292 - %23:sgpr(p2) = G_GEP %0, %22 + %23:sgpr(p4) = G_GEP %0, %22 %24:sgpr(s32) = G_LOAD %23 :: (load 4 from %ir.const0) $sgpr0 = COPY %24 %25:sgpr(s64) = G_CONSTANT i64 4294967296 - %26:sgpr(p2) = G_GEP %0, %25 + %26:sgpr(p4) = G_GEP %0, %25 %27:sgpr(s32) = G_LOAD %26 :: (load 4 from %ir.const0) $sgpr0 = COPY %27 Index: test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll =================================================================== --- test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll +++ test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll @@ -18,28 +18,28 @@ } ; CHECK-LABEL: name: test_ptr2_byval -; CHECK: [[S01:%[0-9]+]]:_(p2) = COPY $sgpr0_sgpr1 +; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; CHECK: G_LOAD [[S01]] -define amdgpu_vs void @test_ptr2_byval(i32 addrspace(2)* byval %arg0) { - %tmp0 = load volatile i32, i32 addrspace(2)* %arg0 +define amdgpu_vs void @test_ptr2_byval(i32 addrspace(4)* byval %arg0) { + %tmp0 = load volatile i32, i32 addrspace(4)* %arg0 ret void } ; CHECK-LABEL: name: test_ptr2_inreg -; CHECK: [[S01:%[0-9]+]]:_(p2) = COPY $sgpr0_sgpr1 +; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY 
$sgpr0_sgpr1 ; CHECK: G_LOAD [[S01]] -define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(2)* inreg %arg0) { - %tmp0 = load volatile i32, i32 addrspace(2)* %arg0 +define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(4)* inreg %arg0) { + %tmp0 = load volatile i32, i32 addrspace(4)* %arg0 ret void } ; CHECK-LABEL: name: test_sgpr_alignment0 ; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0 -; CHECK: [[S23:%[0-9]+]]:_(p2) = COPY $sgpr2_sgpr3 +; CHECK: [[S23:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; CHECK: G_LOAD [[S23]] ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[S0]] -define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(2)* inreg %arg1) { - %tmp0 = load volatile i32, i32 addrspace(2)* %arg1 +define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(4)* inreg %arg1) { + %tmp0 = load volatile i32, i32 addrspace(4)* %arg1 call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg0, float undef, float undef, float undef, i1 false, i1 false) #0 ret void } Index: test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir =================================================================== --- test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir +++ test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir @@ -3,7 +3,7 @@ # REQUIRES: global-isel --- | - define amdgpu_kernel void @load_constant(i32 addrspace(2)* %ptr0) { ret void } + define amdgpu_kernel void @load_constant(i32 addrspace(4)* %ptr0) { ret void } define amdgpu_kernel void @load_global_uniform(i32 addrspace(1)* %ptr1) { %tmp0 = load i32, i32 addrspace(1)* %ptr1 ret void @@ -30,7 +30,7 @@ body: | bb.0: liveins: $sgpr0_sgpr1 - %0:_(p2) = COPY $sgpr0_sgpr1 + %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load 4 from %ir.ptr0) ... 
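The snippet below is an illustrative sketch and not part of the patch: a hypothetical kernel (@copy_constant_to_global is an invented name) written against the new numbering, using the data-layout string introduced in AMDGPUTargetMachine.cpp above. Constant (read-only) pointers are now addrspace(4), and loads through them can still be selected as scalar s_load_dword instructions, which is what the smrd.ll checks below continue to verify.

; Illustrative sketch only -- not part of the patch.
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"

define amdgpu_kernel void @copy_constant_to_global(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  ; %in points into the constant segment, addrspace(4) under the new mapping;
  ; the same code before this patch would have used addrspace(2).
  %val = load i32, i32 addrspace(4)* %in
  store i32 %val, i32 addrspace(1)* %out
  ret void
}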
Index: test/CodeGen/AMDGPU/GlobalISel/smrd.ll =================================================================== --- test/CodeGen/AMDGPU/GlobalISel/smrd.ll +++ test/CodeGen/AMDGPU/GlobalISel/smrd.ll @@ -9,10 +9,10 @@ ; GCN-LABEL: {{^}}smrd0: ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 -define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1 - %1 = load i32, i32 addrspace(2)* %0 + %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 1 + %1 = load i32, i32 addrspace(4)* %0 store i32 %1, i32 addrspace(1)* %out ret void } @@ -21,10 +21,10 @@ ; GCN-LABEL: {{^}}smrd1: ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}} ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 - %1 = load i32, i32 addrspace(2)* %0 + %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 255 + %1 = load i32, i32 addrspace(4)* %0 store i32 %1, i32 addrspace(1)* %out ret void } @@ -36,10 +36,10 @@ ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 ; GCN: s_endpgm -define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 - %1 = load i32, i32 addrspace(2)* %0 + %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 256 + %1 = load i32, i32 addrspace(4)* %0 store i32 %1, i32 addrspace(1)* %out ret void } @@ -51,10 +51,10 @@ ; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b ; TODO: Add VI checks ; XGCN: s_endpgm -define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32 - %1 = load i32, i32 addrspace(2)* %0 + %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 4294967296 ; 2 ^ 32 + %1 = load i32, i32 addrspace(4)* %0 store i32 %1, i32 addrspace(1)* %out ret void } @@ -65,10 +65,10 @@ ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143 - %1 = load i32, i32 addrspace(2)* %0 + %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262143 + %1 = load i32, i32 addrspace(4)* %0 store i32 %1, i32 addrspace(1)* %out ret void } @@ -79,10 +79,10 @@ ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm -define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, 
i64 262144 - %1 = load i32, i32 addrspace(2)* %0 + %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262144 + %1 = load i32, i32 addrspace(4)* %0 store i32 %1, i32 addrspace(1)* %out ret void } Index: test/CodeGen/AMDGPU/add.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/add.v2i16.ll +++ test/CodeGen/AMDGPU/add.v2i16.ll @@ -27,9 +27,9 @@ ; VI: s_add_i32 ; VI: s_add_i32 -define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 { - %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0 - %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1 +define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 { + %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0 + %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1 %add = add <2 x i16> %a, %b store <2 x i16> %add, <2 x i16> addrspace(1)* %out ret void @@ -41,8 +41,8 @@ ; VI: s_add_i32 ; VI: s_add_i32 -define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 { - %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0 +define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 { + %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0 %add = add <2 x i16> %a, %a store <2 x i16> %add, <2 x i16> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/addrspacecast.ll =================================================================== --- test/CodeGen/AMDGPU/addrspacecast.ll +++ test/CodeGen/AMDGPU/addrspacecast.ll @@ -100,8 +100,8 @@ ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]] ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] ; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}} -define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 { - %stof = addrspacecast i32 addrspace(2)* %ptr to i32* +define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #0 { + %stof = addrspacecast i32 addrspace(4)* %ptr to i32* %ld = load volatile i32, i32* %stof ret void } @@ -160,8 +160,8 @@ ; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0 ; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0 define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #0 { - %ftos = addrspacecast i32* %ptr to i32 addrspace(2)* - load volatile i32, i32 addrspace(2)* %ftos + %ftos = addrspacecast i32* %ptr to i32 addrspace(4)* + load volatile i32, i32 addrspace(4)* %ftos ret void } Index: test/CodeGen/AMDGPU/amdgcn.bitcast.ll =================================================================== --- test/CodeGen/AMDGPU/amdgcn.bitcast.ll +++ test/CodeGen/AMDGPU/amdgcn.bitcast.ll @@ -4,9 +4,9 @@ ; This test just checks that the compiler doesn't crash. 
; FUNC-LABEL: {{^}}v32i8_to_v8i32: -define amdgpu_ps float @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 { +define amdgpu_ps float @v32i8_to_v8i32(<32 x i8> addrspace(4)* inreg) #0 { entry: - %1 = load <32 x i8>, <32 x i8> addrspace(2)* %0 + %1 = load <32 x i8>, <32 x i8> addrspace(4)* %0 %2 = bitcast <32 x i8> %1 to <8 x i32> %3 = extractelement <8 x i32> %2, i32 1 %4 = icmp ne i32 %3, 0 Index: test/CodeGen/AMDGPU/amdgpu.private-memory.ll =================================================================== --- test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -48,12 +48,12 @@ ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0 -; HSAOPT: [[DISPATCH_PTR:%[0-9]+]] = call noalias nonnull dereferenceable(64) i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() -; HSAOPT: [[CAST_DISPATCH_PTR:%[0-9]+]] = bitcast i8 addrspace(2)* [[DISPATCH_PTR]] to i32 addrspace(2)* -; HSAOPT: [[GEP0:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(2)* [[CAST_DISPATCH_PTR]], i64 1 -; HSAOPT: [[LDXY:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP0]], align 4, !invariant.load !0 -; HSAOPT: [[GEP1:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(2)* [[CAST_DISPATCH_PTR]], i64 2 -; HSAOPT: [[LDZU:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP1]], align 4, !range !1, !invariant.load !0 +; HSAOPT: [[DISPATCH_PTR:%[0-9]+]] = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() +; HSAOPT: [[CAST_DISPATCH_PTR:%[0-9]+]] = bitcast i8 addrspace(4)* [[DISPATCH_PTR]] to i32 addrspace(4)* +; HSAOPT: [[GEP0:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(4)* [[CAST_DISPATCH_PTR]], i64 1 +; HSAOPT: [[LDXY:%[0-9]+]] = load i32, i32 addrspace(4)* [[GEP0]], align 4, !invariant.load !0 +; HSAOPT: [[GEP1:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(4)* [[CAST_DISPATCH_PTR]], i64 2 +; HSAOPT: [[LDZU:%[0-9]+]] = load i32, i32 addrspace(4)* [[GEP1]], align 4, !range !1, !invariant.load !0 ; HSAOPT: [[EXTRACTY:%[0-9]+]] = lshr i32 [[LDXY]], 16 ; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !2 Index: test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll =================================================================== --- test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -8,10 +8,10 @@ declare i32 @llvm.amdgcn.workitem.id.y() #0 declare i32 @llvm.amdgcn.workitem.id.z() #0 -declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 -declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 -declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 -declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0 declare i64 @llvm.amdgcn.dispatch.id() #0 ; HSA: define void @use_workitem_id_x() #1 { @@ -58,15 +58,15 @@ ; HSA: define void @use_dispatch_ptr() #7 { define void @use_dispatch_ptr() #1 { - %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() - store volatile i8 addrspace(2)* %dispatch.ptr, i8 addrspace(2)* addrspace(1)* undef + %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + store volatile i8 addrspace(4)* %dispatch.ptr, i8 addrspace(4)* addrspace(1)* undef ret void } ; HSA: define void 
@use_queue_ptr() #8 { define void @use_queue_ptr() #1 { - %queue.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr() - store volatile i8 addrspace(2)* %queue.ptr, i8 addrspace(2)* addrspace(1)* undef + %queue.ptr = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr() + store volatile i8 addrspace(4)* %queue.ptr, i8 addrspace(4)* addrspace(1)* undef ret void } @@ -186,22 +186,22 @@ ; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #8 { define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 { - %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* - store volatile i32 0, i32 addrspace(4)* %stof + %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(2)* + store volatile i32 0, i32 addrspace(2)* %stof ret void } ; HSA: define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #12 { define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #2 { - %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* - store volatile i32 0, i32 addrspace(4)* %stof + %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(2)* + store volatile i32 0, i32 addrspace(2)* %stof ret void } ; HSA: define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #13 { define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #2 { - %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* - store volatile i32 0, i32 addrspace(4)* %stof + %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(2)* + store volatile i32 0, i32 addrspace(2)* %stof call void @func_indirect_use_queue_ptr() ret void } @@ -226,8 +226,8 @@ ; HSA: define void @use_kernarg_segment_ptr() #14 { define void @use_kernarg_segment_ptr() #1 { - %kernarg.segment.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() - store volatile i8 addrspace(2)* %kernarg.segment.ptr, i8 addrspace(2)* addrspace(1)* undef + %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() + store volatile i8 addrspace(4)* %kernarg.segment.ptr, i8 addrspace(4)* addrspace(1)* undef ret void } @@ -239,15 +239,15 @@ ; HSA: define amdgpu_kernel void @kern_use_implicitarg_ptr() #15 { define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 { - %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() - store volatile i8 addrspace(2)* %implicitarg.ptr, i8 addrspace(2)* addrspace(1)* undef + %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + store volatile i8 addrspace(4)* %implicitarg.ptr, i8 addrspace(4)* addrspace(1)* undef ret void } ; HSA: define void @use_implicitarg_ptr() #15 { define void @use_implicitarg_ptr() #1 { - %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() - store volatile i8 addrspace(2)* %implicitarg.ptr, i8 addrspace(2)* addrspace(1)* undef + %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + store volatile i8 addrspace(4)* %implicitarg.ptr, i8 addrspace(4)* addrspace(1)* undef ret void } Index: test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll =================================================================== --- test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -8,9 +8,9 @@ declare i32 @llvm.amdgcn.workitem.id.y() #0 declare i32 @llvm.amdgcn.workitem.id.z() #0 -declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 -declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 -declare i8 addrspace(2)* 
@llvm.amdgcn.kernarg.segment.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 ; HSA: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { @@ -149,27 +149,27 @@ ; HSA: define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 { define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 { - %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() - %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* - %val = load i32, i32 addrspace(2)* %bc + %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %bc = bitcast i8 addrspace(4)* %dispatch.ptr to i32 addrspace(4)* + %val = load i32, i32 addrspace(4)* %bc store i32 %val, i32 addrspace(1)* %ptr ret void } ; HSA: define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 { define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 { - %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr() - %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* - %val = load i32, i32 addrspace(2)* %bc + %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr() + %bc = bitcast i8 addrspace(4)* %dispatch.ptr to i32 addrspace(4)* + %val = load i32, i32 addrspace(4)* %bc store i32 %val, i32 addrspace(1)* %ptr ret void } ; HSA: define amdgpu_kernel void @use_kernarg_segment_ptr(i32 addrspace(1)* %ptr) #12 { define amdgpu_kernel void @use_kernarg_segment_ptr(i32 addrspace(1)* %ptr) #1 { - %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() - %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* - %val = load i32, i32 addrspace(2)* %bc + %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() + %bc = bitcast i8 addrspace(4)* %dispatch.ptr to i32 addrspace(4)* + %val = load i32, i32 addrspace(4)* %bc store i32 %val, i32 addrspace(1)* %ptr ret void } @@ -210,9 +210,9 @@ ret void } -; HSA: define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 { -define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 { - %stof = addrspacecast i32 addrspace(2)* %ptr to i32* +; HSA: define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #1 { +define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #1 { + %stof = addrspacecast i32 addrspace(4)* %ptr to i32* %ld = load volatile i32, i32* %stof ret void } @@ -226,8 +226,8 @@ ; HSA: define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #1 { define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #1 { - %ftos = addrspacecast i32* %ptr to i32 addrspace(2)* - %ld = load volatile i32, i32 addrspace(2)* %ftos + %ftos = addrspacecast i32* %ptr to i32 addrspace(4)* + %ld = load volatile i32, i32 addrspace(4)* %ftos ret void } Index: test/CodeGen/AMDGPU/branch-relaxation.ll =================================================================== --- test/CodeGen/AMDGPU/branch-relaxation.ll +++ test/CodeGen/AMDGPU/branch-relaxation.ll @@ -358,7 +358,7 @@ br i1 %cmp0, label %bb2, label %bb1 bb1: - %val = load volatile i32, i32 addrspace(2)* undef + %val = load volatile i32, i32 addrspace(4)* undef %cmp1 = icmp eq i32 %val, 3 br i1 %cmp1, label %bb3, label %bb2 Index: test/CodeGen/AMDGPU/call-argument-types.ll 
=================================================================== --- test/CodeGen/AMDGPU/call-argument-types.ll +++ test/CodeGen/AMDGPU/call-argument-types.ll @@ -345,7 +345,7 @@ ; GCN: s_waitcnt ; GCN-NEXT: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { - %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(2)* undef + %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr call void @external_void_func_v8i32(<8 x i32> %val) ret void @@ -359,7 +359,7 @@ ; GCN: s_waitcnt ; GCN-NEXT: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { - %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(2)* undef + %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr call void @external_void_func_v16i32(<16 x i32> %val) ret void @@ -377,7 +377,7 @@ ; GCN: s_waitcnt ; GCN-NEXT: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { - %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef + %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr call void @external_void_func_v32i32(<32 x i32> %val) ret void @@ -405,7 +405,7 @@ ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { - %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef + %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef %val0 = load <32 x i32>, <32 x i32> addrspace(1)* %ptr0 %val1 = load i32, i32 addrspace(1)* undef call void @external_void_func_v32i32_i32(<32 x i32> %val0, i32 %val1) @@ -430,7 +430,7 @@ ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { - %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(2)* undef + %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0 call void @external_void_func_struct_i8_i32({ i8, i32 } %val) ret void @@ -516,7 +516,7 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_v16i8: define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { - %ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(2)* undef + %ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(4)* undef %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr call void @external_void_func_v16i8(<16 x i8> %val) ret void Index: test/CodeGen/AMDGPU/callee-special-input-sgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -4,9 +4,9 @@ ; GCN-LABEL: {{^}}use_dispatch_ptr: ; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 define void @use_dispatch_ptr() #1 { - %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 - %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* - %value = load volatile i32, i32 addrspace(2)* %header_ptr + %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 + %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)* + %value = load volatile i32, i32 addrspace(4)* 
%header_ptr ret void } @@ -21,9 +21,9 @@ ; GCN-LABEL: {{^}}use_queue_ptr: ; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 define void @use_queue_ptr() #1 { - %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 - %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* - %value = load volatile i32, i32 addrspace(2)* %header_ptr + %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 + %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* + %value = load volatile i32, i32 addrspace(4)* %header_ptr ret void } @@ -62,9 +62,9 @@ ; GCN-LABEL: {{^}}use_kernarg_segment_ptr: ; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 define void @use_kernarg_segment_ptr() #1 { - %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 - %header_ptr = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)* - %value = load volatile i32, i32 addrspace(2)* %header_ptr + %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 + %header_ptr = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)* + %value = load volatile i32, i32 addrspace(4)* %header_ptr ret void } @@ -435,17 +435,17 @@ %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca - %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 - %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* - %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc + %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 + %dispatch_ptr.bc = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)* + %val0 = load volatile i32, i32 addrspace(4)* %dispatch_ptr.bc - %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 - %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* - %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc + %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 + %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* + %val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc - %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 - %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)* - %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc + %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 + %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)* + %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc %val3 = call i64 @llvm.amdgcn.dispatch.id() call void asm sideeffect "; use $0", "s"(i64 %val3) @@ -515,17 +515,17 @@ %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca - %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 - %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* - %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc + %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 + %dispatch_ptr.bc = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)* + %val0 = load volatile i32, i32 addrspace(4)* %dispatch_ptr.bc - %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 - %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* - %val1 = load volatile i32, i32 
addrspace(2)* %queue_ptr.bc + %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 + %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* + %val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc - %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 - %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)* - %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc + %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 + %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)* + %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc %val3 = call i64 @llvm.amdgcn.dispatch.id() call void asm sideeffect "; use $0", "s"(i64 %val3) @@ -573,17 +573,17 @@ store volatile i32 0, i32 addrspace(5)* %alloca - %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 - %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* - %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc + %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 + %dispatch_ptr.bc = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)* + %val0 = load volatile i32, i32 addrspace(4)* %dispatch_ptr.bc - %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 - %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* - %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc + %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 + %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* + %val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc - %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 - %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)* - %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc + %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 + %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)* + %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc %val3 = call i64 @llvm.amdgcn.dispatch.id() call void asm sideeffect "; use $0", "s"(i64 %val3) @@ -603,10 +603,10 @@ declare i32 @llvm.amdgcn.workgroup.id.x() #0 declare i32 @llvm.amdgcn.workgroup.id.y() #0 declare i32 @llvm.amdgcn.workgroup.id.z() #0 -declare noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 -declare noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 +declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 +declare noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 declare i64 @llvm.amdgcn.dispatch.id() #0 -declare noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 +declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 attributes #0 = { nounwind readnone speculatable } attributes #1 = { nounwind noinline } Index: test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll =================================================================== --- test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll +++ test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll @@ -87,12 +87,12 @@ entry: %out.gep = getelementptr i32, i32* %out, i64 999999 %in.gep = getelementptr i32, i32* %in, i64 7 - %cast = addrspacecast i32* %in.gep to i32 addrspace(2)* + %cast = addrspacecast i32* %in.gep to i32 
addrspace(4)* %tmp0 = icmp eq i32 %cond, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i32, i32 addrspace(2)* %cast + %tmp1 = load i32, i32 addrspace(4)* %cast br label %endif endif: Index: test/CodeGen/AMDGPU/cgp-addressing-modes.ll =================================================================== --- test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -268,23 +268,23 @@ } ; OPT-LABEL: @test_sink_constant_small_offset_i32 -; OPT-NOT: getelementptr i32, i32 addrspace(2)* +; OPT-NOT: getelementptr i32, i32 addrspace(4)* ; OPT: br i1 ; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32: ; GCN: s_and_saveexec_b64 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}} ; GCN: s_or_b64 exec, exec -define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i32, i32 addrspace(2)* %in.gep + %tmp1 = load i32, i32 addrspace(4)* %in.gep br label %endif endif: @@ -297,23 +297,23 @@ } ; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32 -; OPT-NOT: getelementptr i32, i32 addrspace(2)* +; OPT-NOT: getelementptr i32, i32 addrspace(4)* ; OPT: br i1 ; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32: ; GCN: s_and_saveexec_b64 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}} ; GCN: s_or_b64 exec, exec -define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 255 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i32, i32 addrspace(2)* %in.gep + %tmp1 = load i32, i32 addrspace(4)* %in.gep br label %endif endif: @@ -326,9 +326,9 @@ } ; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32 -; OPT-SI: getelementptr i32, i32 addrspace(2)* -; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)* -; OPT-VI-NOT: getelementptr i32, i32 addrspace(2)* +; OPT-SI: getelementptr i32, i32 addrspace(4)* +; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)* +; OPT-VI-NOT: getelementptr i32, i32 addrspace(4)* ; OPT: br i1 ; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32: @@ -337,16 +337,16 @@ ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} ; GCN: s_or_b64 exec, exec -define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 256 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 
%tid, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i32, i32 addrspace(2)* %in.gep + %tmp1 = load i32, i32 addrspace(4)* %in.gep br label %endif endif: @@ -359,8 +359,8 @@ } ; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32 -; OPT-SI: getelementptr i32, i32 addrspace(2)* -; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)* +; OPT-SI: getelementptr i32, i32 addrspace(4)* +; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)* ; OPT: br i1 ; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32: @@ -369,16 +369,16 @@ ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}} ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN: s_or_b64 exec, exec -define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 4294967295 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i32, i32 addrspace(2)* %in.gep + %tmp1 = load i32, i32 addrspace(4)* %in.gep br label %endif endif: @@ -391,7 +391,7 @@ } ; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32 -; OPT: getelementptr i32, i32 addrspace(2)* +; OPT: getelementptr i32, i32 addrspace(4)* ; OPT: br i1 ; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32: @@ -400,16 +400,16 @@ ; GCN: s_addc_u32 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN: s_or_b64 exec, exec -define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 17179869181 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i32, i32 addrspace(2)* %in.gep + %tmp1 = load i32, i32 addrspace(4)* %in.gep br label %endif endif: @@ -430,16 +430,16 @@ ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}} ; GCN: s_or_b64 exec, exec -define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262143 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i32, i32 addrspace(2)* %in.gep + %tmp1 = load i32, i32 addrspace(4)* %in.gep br label %endif endif: @@ -452,9 +452,9 @@ } ; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32 -; OPT-SI: getelementptr i32, i32 addrspace(2)* -; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)* -; OPT-VI: getelementptr i32, i32 addrspace(2)* +; OPT-SI: getelementptr i32, i32 addrspace(4)* +; OPT-CI-NOT: getelementptr 
i32, i32 addrspace(4)* +; OPT-VI: getelementptr i32, i32 addrspace(4)* ; OPT: br i1 ; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32: @@ -468,16 +468,16 @@ ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} ; GCN: s_or_b64 exec, exec -define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262144 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i32, i32 addrspace(2)* %in.gep + %tmp1 = load i32, i32 addrspace(4)* %in.gep br label %endif endif: @@ -524,17 +524,17 @@ ; OPT: br i1 %tmp0, ; OPT: if: ; OPT: getelementptr i8, {{.*}} 4095 -define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(4)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024 - %in.gep = getelementptr i8, i8 addrspace(2)* %in, i64 4095 + %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4095 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - %bitcast = bitcast i8 addrspace(2)* %in.gep to i32 addrspace(2)* - %tmp1 = load i32, i32 addrspace(2)* %bitcast, align 1 + %bitcast = bitcast i8 addrspace(4)* %in.gep to i32 addrspace(4)* + %tmp1 = load i32, i32 addrspace(4)* %bitcast, align 1 br label %endif endif: Index: test/CodeGen/AMDGPU/early-if-convert-cost.ll =================================================================== --- test/CodeGen/AMDGPU/early-if-convert-cost.ll +++ test/CodeGen/AMDGPU/early-if-convert-cost.ll @@ -32,9 +32,9 @@ ; GCN: v_add_f64 ; GCN: v_cndmask_b32_e32 ; GCN: v_cndmask_b32_e32 -define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 { +define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(4)* %in) #0 { entry: - %v = load double, double addrspace(2)* %in + %v = load double, double addrspace(4)* %in %cc = fcmp oeq double %v, 1.000000e+00 br i1 %cc, label %if, label %endif Index: test/CodeGen/AMDGPU/early-if-convert.ll =================================================================== --- test/CodeGen/AMDGPU/early-if-convert.ll +++ test/CodeGen/AMDGPU/early-if-convert.ll @@ -187,9 +187,9 @@ ; GCN: [[ENDIF]]: ; GCN: buffer_store_dword -define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 { +define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(4)* %in, float %cnd) #0 { entry: - %v = load i32, i32 addrspace(2)* %in + %v = load i32, i32 addrspace(4)* %in %cc = fcmp oeq float %cnd, 1.000000e+00 br i1 %cc, label %if, label %endif @@ -206,9 +206,9 @@ ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load: ; GCN: v_cndmask_b32 -define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 { +define amdgpu_kernel void 
@test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(4)* %in) #0 { entry: - %v = load float, float addrspace(2)* %in + %v = load float, float addrspace(4)* %in %cc = fcmp oeq float %v, 1.000000e+00 br i1 %cc, label %if, label %endif @@ -248,9 +248,9 @@ ; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]] ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 ; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]] -define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 { +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(4)* %in, i32 %cond) #0 { entry: - %v = load i32, i32 addrspace(2)* %in + %v = load i32, i32 addrspace(4)* %in %cc = icmp eq i32 %cond, 1 br i1 %cc, label %if, label %endif @@ -295,9 +295,9 @@ ; GCN: s_addc_u32 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} -define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 { +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(4)* %in, i32 %cond) #0 { entry: - %v = load i64, i64 addrspace(2)* %in + %v = load i64, i64 addrspace(4)* %in %cc = icmp eq i32 %cond, 1 br i1 %cc, label %if, label %endif @@ -320,9 +320,9 @@ ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} -define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 { +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(4)* %in, i32 %cond) #0 { entry: - %v = load <3 x i32>, <3 x i32> addrspace(2)* %in + %v = load <3 x i32>, <3 x i32> addrspace(4)* %in %cc = icmp eq i32 %cond, 1 br i1 %cc, label %if, label %endif @@ -345,9 +345,9 @@ ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} -define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 { +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(4)* %in, i32 %cond) #0 { entry: - %v = load <4 x i32>, <4 x i32> addrspace(2)* %in + %v = load <4 x i32>, <4 x i32> addrspace(4)* %in %cc = icmp eq i32 %cond, 1 br i1 %cc, label %if, label %endif Index: test/CodeGen/AMDGPU/extract_vector_elt-f16.ll =================================================================== --- test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -8,8 +8,8 @@ ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] ; GCN-DAG: buffer_store_short [[VELT0]] ; GCN-DAG: buffer_store_short [[VELT1]] -define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { - %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr +define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 { + %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr %p0 = extractelement <2 x half> %vec, i32 0 %p1 = extractelement <2 x half> %vec, i32 1 %out1 = getelementptr half, half addrspace(1)* %out, i32 10 @@ -26,8 +26,8 @@ ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] ; GCN: buffer_store_short [[VELT1]] ; GCN: ScratchSize: 0 
-define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 %idx) #0 { - %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr +define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr, i32 %idx) #0 { + %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr %elt = extractelement <2 x half> %vec, i32 %idx store half %elt, half addrspace(1)* %out, align 2 ret void @@ -45,12 +45,12 @@ ; SI: buffer_store_short [[ELT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]] ; GCN: ScratchSize: 0{{$}} -define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 { +define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext - %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr + %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr %idx = load i32, i32 addrspace(1)* %gep %elt = extractelement <2 x half> %vec, i32 %idx store half %elt, half addrspace(1)* %out.gep, align 2 Index: test/CodeGen/AMDGPU/extract_vector_elt-i16.ll =================================================================== --- test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -9,8 +9,8 @@ ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] ; GCN-DAG: buffer_store_short [[VELT0]] ; GCN-DAG: buffer_store_short [[VELT1]] -define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr +define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %p0 = extractelement <2 x i16> %vec, i32 0 %p1 = extractelement <2 x i16> %vec, i32 1 %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10 @@ -27,8 +27,8 @@ ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] ; GCN: buffer_store_short [[VELT1]] ; GCN: ScratchSize: 0 -define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %idx) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr +define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %idx) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %elt = extractelement <2 x i16> %vec, i32 %idx store i16 %elt, i16 addrspace(1)* %out, align 2 ret void @@ -45,13 +45,13 @@ ; SI: buffer_store_short [[ELT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]] ; GCN: ScratchSize: 0{{$}} -define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 { +define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext %out.gep = 
getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext %idx = load volatile i32, i32 addrspace(1)* %gep - %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %elt = extractelement <2 x i16> %vec, i32 %idx store i16 %elt, i16 addrspace(1)* %out.gep, align 2 ret void Index: test/CodeGen/AMDGPU/fence-barrier.ll =================================================================== --- test/CodeGen/AMDGPU/fence-barrier.ll +++ test/CodeGen/AMDGPU/fence-barrier.ll @@ -1,8 +1,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=gfx803 -enable-si-insert-waitcnts=1 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s ; RUN: llvm-as -data-layout=A5 < %s | llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=gfx803 -enable-si-insert-waitcnts=1 -verify-machineinstrs | FileCheck --check-prefix=GCN %s -declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() -declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() +declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() +declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.workgroup.id.x() declare void @llvm.amdgcn.s.barrier() @@ -34,19 +34,19 @@ fence syncscope("workgroup") acquire %8 = load i32, i32 addrspace(3)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(3)* @test_local.temp, i64 0, i64 0), align 4 %9 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %2, align 4 - %10 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() + %10 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() %11 = call i32 @llvm.amdgcn.workitem.id.x() %12 = call i32 @llvm.amdgcn.workgroup.id.x() - %13 = getelementptr inbounds i8, i8 addrspace(2)* %10, i64 4 - %14 = bitcast i8 addrspace(2)* %13 to i16 addrspace(2)* - %15 = load i16, i16 addrspace(2)* %14, align 4 + %13 = getelementptr inbounds i8, i8 addrspace(4)* %10, i64 4 + %14 = bitcast i8 addrspace(4)* %13 to i16 addrspace(4)* + %15 = load i16, i16 addrspace(4)* %14, align 4 %16 = zext i16 %15 to i32 %17 = mul i32 %12, %16 %18 = add i32 %17, %11 - %19 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() + %19 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() %20 = zext i32 %18 to i64 - %21 = bitcast i8 addrspace(2)* %19 to i64 addrspace(2)* - %22 = load i64, i64 addrspace(2)* %21, align 8 + %21 = bitcast i8 addrspace(4)* %19 to i64 addrspace(4)* + %22 = load i64, i64 addrspace(4)* %21, align 8 %23 = add i64 %22, %20 %24 = getelementptr inbounds i32, i32 addrspace(1)* %9, i64 %23 store i32 %8, i32 addrspace(1)* %24, align 4 @@ -68,56 +68,56 @@ ;