diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -383,6 +383,10 @@ else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32")) MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32)); } + // Set -amdgpu-fixed-function-abi to true if not explicitly provided. + if (TT.getOS() == Triple::AMDHSA && + EnableAMDGPUFixedFunctionABIOpt.getNumOccurrences() == 0) + EnableFixedFunctionABI = true; } bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll @@ -53,9 +53,9 @@ ; CHECK: bb.1 (%ir-block.0): ; CHECK: liveins: $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %2, !0 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 - ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %2 + ; CHECK: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %8, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %9, !0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %9 ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY2]] ; CHECK: $vgpr0 = COPY [[ADD]](s32) ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] @@ -87,8 +87,8 @@ ; CHECK: bb.1.entry: ; CHECK: liveins: $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 + ; CHECK: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 ; CHECK: $vgpr0 = COPY [[COPY1]](s32) ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 @@ -102,8 +102,8 @@ ; CHECK: bb.1.entry: ; CHECK: liveins: $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 + ; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %8 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 ; CHECK: $vgpr0 = COPY [[COPY1]](s32) ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 @@ -118,9 +118,9 @@ ; CHECK: bb.1 (%ir-block.0): ; CHECK: liveins: $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 1835018 /* regdef:VGPR_32 */, def %2 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 - ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %2 + ; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 1835018 /* regdef:VGPR_32 */, def %9 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %9 ; CHECK: 
[[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY1]], [[COPY2]] ; CHECK: $vgpr0 = COPY [[FADD]](s32) ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] @@ -138,9 +138,9 @@ ; CHECK: bb.1 (%ir-block.0): ; CHECK: liveins: $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 2883594 /* regdef:VReg_64 */, def %2 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 - ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY %2 + ; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 2883594 /* regdef:VReg_64 */, def %9 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY %9 ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64) ; CHECK: $vgpr0 = COPY [[UV]](s32) ; CHECK: $vgpr1 = COPY [[UV1]](s32) @@ -209,8 +209,8 @@ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) - ; CHECK: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %2, 1835017 /* reguse:VGPR_32 */, [[COPY2]] - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %2 + ; CHECK: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 1835017 /* reguse:VGPR_32 */, [[COPY2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %9 ; CHECK: $vgpr0 = COPY [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] ; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 @@ -225,8 +225,8 @@ ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %2, 196622 /* mem:m */, [[COPY]](p3) - ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %2 + ; CHECK: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 196622 /* mem:m */, [[COPY]](p3) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %9 ; CHECK: $vgpr0 = COPY [[COPY2]](s32) ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] ; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 @@ -243,8 +243,8 @@ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[AND]](s32) - ; CHECK: INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3) - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %4 + ; CHECK: INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %11 ; CHECK: $vgpr0 = COPY [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] ; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 @@ -258,14 +258,14 @@ ; CHECK: bb.1.entry: ; CHECK: liveins: $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 - ; CHECK: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %3 - ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %3 + ; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 
1966090 /* regdef:SReg_32 */, def %8 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %10 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %10 ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY1]](s32) ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY2]](s32) - ; CHECK: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %5, 1966089 /* reguse:SReg_32 */, [[COPY3]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3) - ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY %5 + ; CHECK: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %12, 1966089 /* reguse:SReg_32 */, [[COPY3]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY %12 ; CHECK: $vgpr0 = COPY [[COPY5]](s32) ; CHECK: [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] ; CHECK: S_SETPC_B64_return [[COPY6]], implicit $vgpr0 @@ -288,10 +288,10 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]](s32) ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32) - ; CHECK: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %4, 1835018 /* regdef:VGPR_32 */, def %5, 1835018 /* regdef:VGPR_32 */, def %6, 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY5]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY6]](tied-def 5) - ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY %4 - ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY %5 - ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY %6 + ; CHECK: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 1835018 /* regdef:VGPR_32 */, def %12, 1835018 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY5]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY6]](tied-def 5) + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY %11 + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY %12 + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY %13 ; CHECK: G_STORE [[COPY7]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; CHECK: G_STORE [[COPY8]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; CHECK: G_STORE [[COPY9]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) @@ -312,11 +312,11 @@ ; CHECK: bb.1.entry: ; CHECK: liveins: $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 + ; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %8 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32) - ; CHECK: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %3, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3) - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %3 + ; CHECK: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %10 ; CHECK: $vgpr0 = COPY [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] ; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll @@ -14,7 +14,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX8-NEXT: s_trap 2 ; GFX8-NEXT: ds_write_b32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -24,7 +24,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX9-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX9-NEXT: s_trap 2 ; GFX9-NEXT: ds_write_b32 v0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -38,7 +38,7 @@ ; GFX8-LABEL: func_use_lds_global_constexpr_cast: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX8-NEXT: s_trap 2 ; GFX8-NEXT: flat_store_dword v[0:1], v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -47,7 +47,7 @@ ; GFX9-LABEL: func_use_lds_global_constexpr_cast: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX9-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX9-NEXT: s_trap 2 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll @@ -1,5 +1,5 @@ -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,CO-V2 %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=carrizo -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,CO-V2 %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,CO-V2 %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=carrizo -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,CO-V2 %s ; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s ; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s ; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s @@ -90,7 +90,8 @@ ; ALL-LABEL: {{^}}test_workitem_id_x_func: ; ALL: s_waitcnt -; ALL-NEXT: v_and_b32_e32 v2, 0x3ff, v2 +; HSA-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; MESA-NEXT: v_and_b32_e32 v2, 0x3ff, v2 define void @test_workitem_id_x_func(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() store i32 %id, i32 addrspace(1)* %out @@ -98,8 +99,8 @@ } ; ALL-LABEL: {{^}}test_workitem_id_y_func: -; ALL: v_lshrrev_b32_e32 v2, 10, v2 -; ALL-NEXT: v_and_b32_e32 v2, 0x3ff, v2 +; HSA: v_lshrrev_b32_e32 v2, 10, v31 +; MESA: v_lshrrev_b32_e32 v2, 10, v2 define void @test_workitem_id_y_func(i32 addrspace(1)* 
%out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.y() store i32 %id, i32 addrspace(1)* %out @@ -107,8 +108,8 @@ } ; ALL-LABEL: {{^}}test_workitem_id_z_func: -; ALL: v_lshrrev_b32_e32 v2, 20, v2 -; ALL-NEXT: v_and_b32_e32 v2, 0x3ff, v2 +; HSA: v_lshrrev_b32_e32 v2, 20, v31 +; MESA: v_lshrrev_b32_e32 v2, 20, v2 define void @test_workitem_id_z_func(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.z() store i32 %id, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -187,7 +187,7 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v4 ; GCN-NEXT: v_add_u32_e32 v2, s6, v2 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v5 +; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v31 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 ; GCN-NEXT: global_store_dword v[0:1], v2, off @@ -243,15 +243,15 @@ ; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 ; GCN-NEXT: s_add_u32 s7, s6, 4 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v5, s6 -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v2, 1 -; GCN-NEXT: v_mov_b32_e32 v5, s7 -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v4, s7 +; GCN-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v3 ; GCN-NEXT: v_add_u32_e32 v2, s6, v2 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v31 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 ; GCN-NEXT: global_store_dword v[0:1], v2, off diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -42,7 +42,7 @@ ; Test handling inside a non-kernel ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func: -; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}} +; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x10{{$}} ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] ; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0 ; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll --- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll @@ -93,7 +93,7 @@ ; GCN-LABEL: {{^}}kernel_call_func_32_agprs: ; GCN: .amdhsa_next_free_vgpr 32 -; GCN: NumVgprs: 9 +; GCN: NumVgprs: 32 ; GCN: NumAgprs: 32 ; GCN: TotalNumVgprs: 32 ; GCN: VGPRBlocks: 7 @@ -106,7 +106,7 @@ } ; GCN-LABEL: {{^}}func_call_func_32_agprs: -; GCN: NumVgprs: 9 +; GCN: NumVgprs: 32 ; GCN: NumAgprs: 32 ; GCN: TotalNumVgprs: 32 define void @func_call_func_32_agprs() { @@ -118,13 +118,13 @@ declare void @undef_func() ; GCN-LABEL: {{^}}kernel_call_undef_func: -; GCN: .amdhsa_next_free_vgpr 24 -; GCN: NumVgprs: 24 +; GCN: .amdhsa_next_free_vgpr 32 +; GCN: NumVgprs: 32 ; GCN: NumAgprs: 24 -; GCN: TotalNumVgprs: 24 -; GCN: VGPRBlocks: 5 -; GCN: NumVGPRsForWavesPerEU: 24 -; GCN: Occupancy: 10 +; GCN: TotalNumVgprs: 32 +; GCN: VGPRBlocks: 7 +; GCN: NumVGPRsForWavesPerEU: 32 +; GCN: Occupancy: 8 define 
amdgpu_kernel void @kernel_call_undef_func() { bb: call void @undef_func() diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -80,14 +80,15 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_i1_signext: ; HSA: buffer_load_ubyte [[VAR:v[0-9]+]] -; HSA: s_mov_b32 s32, 0 +; HSA-DAG: s_mov_b32 s32, 0 ; MESA-DAG: buffer_load_ubyte [[VAR:v[0-9]+]] ; MESA-DAG: s_mov_b32 s32, 0{{$}} ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+12 -; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1 +; MESA-DAG: v_bfe_i32 v0, v0, 0, 1 +; HSA: v_bfe_i32 v0, v3, 0, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { @@ -99,18 +100,24 @@ ; FIXME: load should be scheduled before getpc ; GCN-LABEL: {{^}}test_call_external_void_func_i1_zeroext: -; HSA: buffer_load_ubyte v0 +; HSA: buffer_load_ubyte v3 ; HSA-DAG: s_mov_b32 s32, 0{{$}} ; MESA: buffer_load_ubyte v0 ; MESA-DAG: s_mov_b32 s32, 0{{$}} -; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12 -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} -; GCN-NEXT: s_endpgm +; MESA: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; MESA-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12 +; MESA-NEXT: v_and_b32_e32 v0, 1, v0 +; MESA-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; MESA-NEXT: s_endpgm +; HSA: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; HSA-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12 +; HSA-NEXT: v_and_b32_e32 v0, 1, v3 +; HSA-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; HSA-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { %var = load volatile i1, i1 addrspace(1)* undef call void @external_void_func_i1_zeroext(i1 %var) @@ -136,7 +143,8 @@ ; FIXME: don't wait before call ; GCN-LABEL: {{^}}test_call_external_void_func_i8_signext: -; GCN-DAG: buffer_load_sbyte v0 +; MESA-DAG: buffer_load_sbyte v0 +; HSA-DAG: buffer_load_sbyte v3 ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+12 @@ -144,7 +152,7 @@ ; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt -; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { %var = load volatile i8, i8 addrspace(1)* undef @@ -154,7 +162,8 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_i8_zeroext: -; GCN-DAG: 
buffer_load_ubyte v0 +; MESA-DAG: buffer_load_ubyte v0 +; HSA-DAG: buffer_load_ubyte v3 ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+12 @@ -162,7 +171,7 @@ ; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt -; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { %var = load volatile i8, i8 addrspace(1)* undef @@ -183,7 +192,8 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_i16_signext: -; GCN-DAG: buffer_load_sshort v0 +; MESA-DAG: buffer_load_sshort v0 +; HSA-DAG: buffer_load_sshort v3 ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+12 @@ -191,7 +201,7 @@ ; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt -; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { %var = load volatile i16, i16 addrspace(1)* undef @@ -208,7 +218,7 @@ ; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt -; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { %var = load volatile i16, i16 addrspace(1)* undef @@ -481,7 +491,7 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm: {{.*}} -; GCN-NOT: v3 +; GCN-NOT: v3, ; GCN-DAG: v_mov_b32_e32 v0, 3 ; GCN-DAG: v_mov_b32_e32 v1, 4 ; GCN-DAG: v_mov_b32_e32 v2, 5 @@ -586,7 +596,7 @@ ; GCN-DAG: buffer_load_dwordx4 v[20:23], off ; GCN-DAG: buffer_load_dwordx4 v[24:27], off ; GCN-DAG: buffer_load_dwordx4 v[28:31], off -; GCN-NOT: s_waitcnt +; MESA-NOT: s_waitcnt ; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef @@ -611,7 +621,8 @@ ; GCN-DAG: buffer_load_dwordx4 v[28:31], off ; GCN: s_waitcnt -; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32{{$}} +; MESA: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32{{$}} +; HSA: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { @@ -634,9 +645,11 @@ } ; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32: -; GCN: buffer_load_ubyte v0, off -; GCN: buffer_load_dword v1, off -; GCN-NOT: s_waitcnt +; MESA: buffer_load_ubyte v0, off +; MESA-DAG: buffer_load_dword v1, off +; HSA: buffer_load_ubyte v3, off +; HSA-DAG: buffer_load_dword v4, off +; MESA-NOT: s_waitcnt ; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef @@ -738,15 +751,19 @@ } ; GCN-LABEL: {{^}}tail_call_byval_align16: -; GCN-NOT: s32 -; GCN: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32 offset:8 -; GCN: buffer_load_dword 
[[VREG2:v[0-9]+]], off, s[0:3], s32 offset:12 +; GCN-NOT: s32, +; MESA: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32 offset:8 +; MESA: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:12 +; HSA: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32 +; HSA: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:24 ; GCN: s_getpc_b64 -; GCN: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:4 -; GCN: buffer_store_dword [[VREG1]], off, s[0:3], s32{{$}} -; GCN-NOT: s32 +; MESA: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:4 +; MESA: buffer_store_dword [[VREG1]], off, s[0:3], s32{{$}} +; HSA: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:16 +; HSA: buffer_store_dword [[VREG1]], off, s[0:3], s32 +; GCN-NOT: s32, ; GCN: s_setpc_b64 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { entry: @@ -757,11 +774,16 @@ ; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64: ; GCN-NOT: s32 -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN: buffer_load_dword v33, off, s[0:3], s32{{$}} -; GCN: s_getpc_b64 -; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}} -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; MESA: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; MESA: buffer_load_dword v33, off, s[0:3], s32{{$}} +; MESA: s_getpc_b64 +; MESA: buffer_store_dword v33, off, s[0:3], s32{{$}} +; MESA: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; HSA: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; HSA: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; HSA: s_getpc_b64 +; HSA: buffer_store_dword v33, off, s[0:3], s32 offset:4 +; HSA: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { @@ -771,16 +793,27 @@ } ; GCN-LABEL: {{^}}stack_12xv3i32: -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN: buffer_store_dword [[REG12]], {{.*$}} -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12 -; GCN: v_mov_b32_e32 v31, 11 -; GCN: s_getpc +; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 +; MESA: buffer_store_dword [[REG12]], {{.*$}} +; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 +; MESA: buffer_store_dword [[REG13]], {{.*}} offset:4 +; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 +; MESA: buffer_store_dword [[REG14]], {{.*}} offset:8 +; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 +; MESA: buffer_store_dword [[REG15]], {{.*}} offset:12 +; MESA: v_mov_b32_e32 v31, 11 +; MESA: s_getpc +; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 11 +; HSA: buffer_store_dword [[REG12]], {{.*$}} +; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 +; HSA: buffer_store_dword [[REG12]], {{.*}} offset:4 +; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 +; HSA: buffer_store_dword [[REG13]], {{.*}} offset:8 +; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 +; HSA: buffer_store_dword [[REG14]], {{.*}} offset:12 +; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 +; HSA: buffer_store_dword [[REG15]], {{.*}} offset:16 +; HSA: s_getpc define void @stack_12xv3i32() #0 { entry: call void @external_void_func_12xv3i32( @@ -800,16 +833,25 @@ } ; GCN-LABEL: {{^}}stack_12xv3f32: -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; GCN: buffer_store_dword [[REG12]], {{.*$}} -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 
-; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12 -; GCN: v_mov_b32_e32 v31, 0x41300000 -; GCN: s_getpc +; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 +; MESA: buffer_store_dword [[REG12]], {{.*$}} +; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 +; MESA: buffer_store_dword [[REG13]], {{.*}} offset:4 +; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 +; MESA: buffer_store_dword [[REG14]], {{.*}} offset:8 +; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 +; MESA: buffer_store_dword [[REG15]], {{.*}} offset:12 +; MESA: v_mov_b32_e32 v31, 0x41300000 +; MESA: s_getpc +; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 +; HSA: buffer_store_dword [[REG12]], {{.*}} offset:4 +; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 +; HSA: buffer_store_dword [[REG13]], {{.*}} offset:8 +; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 +; HSA: buffer_store_dword [[REG14]], {{.*}} offset:12 +; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 +; HSA: buffer_store_dword [[REG15]], {{.*}} offset:16 +; HSA: s_getpc define void @stack_12xv3f32() #0 { entry: call void @external_void_func_12xv3f32( @@ -830,24 +872,41 @@ ; GCN-LABEL: {{^}}stack_8xv5i32: -; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 -; GCN: buffer_store_dword [[REG8]], {{.*$}} -; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 -; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 -; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 -; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 - -; GCN: v_mov_b32_e32 v31, 7 +; MESA: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 +; MESA: buffer_store_dword [[REG8]], {{.*$}} +; MESA: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 +; MESA: buffer_store_dword [[REG9]], {{.*}} offset:4 +; MESA: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 +; MESA: buffer_store_dword [[REG10]], {{.*}} offset:8 +; MESA: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 +; MESA: buffer_store_dword [[REG11]], {{.*}} offset:12 +; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 +; MESA: buffer_store_dword [[REG12]], {{.*}} offset:16 +; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 +; MESA: buffer_store_dword [[REG13]], {{.*}} offset:20 +; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 +; MESA: buffer_store_dword [[REG14]], {{.*}} offset:24 +; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 +; MESA: buffer_store_dword [[REG15]], {{.*}} offset:28 +; HSA: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 +; HSA: buffer_store_dword [[REG8]], {{.*}} offset:4 +; HSA: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 +; HSA: buffer_store_dword [[REG9]], {{.*}} offset:8 +; HSA: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 +; HSA: buffer_store_dword [[REG10]], {{.*}} offset:12 +; HSA: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 +; HSA: buffer_store_dword [[REG11]], {{.*}} offset:16 +; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 +; HSA: buffer_store_dword [[REG12]], {{.*}} offset:20 +; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 +; HSA: buffer_store_dword [[REG13]], {{.*}} 
offset:24 +; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 +; HSA: buffer_store_dword [[REG14]], {{.*}} offset:28 +; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 +; HSA: buffer_store_dword [[REG15]], {{.*}} offset:32 + + +; MESA: v_mov_b32_e32 v31, 7 ; GCN: s_getpc define void @stack_8xv5i32() #0 { entry: @@ -864,24 +923,42 @@ } ; GCN-LABEL: {{^}}stack_8xv5f32: -; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000 -; GCN: buffer_store_dword [[REG8]], {{.*$}} -; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 -; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 -; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 -; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 -; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 - -; GCN: v_mov_b32_e32 v31, 0x40e00000 +; MESA: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000 +; MESA: buffer_store_dword [[REG8]], {{.*$}} +; MESA: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 +; MESA: buffer_store_dword [[REG9]], {{.*}} offset:4 +; MESA: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 +; MESA: buffer_store_dword [[REG10]], {{.*}} offset:8 +; MESA: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 +; MESA: buffer_store_dword [[REG11]], {{.*}} offset:12 +; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 +; MESA: buffer_store_dword [[REG12]], {{.*}} offset:16 +; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 +; MESA: buffer_store_dword [[REG13]], {{.*}} offset:20 +; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 +; MESA: buffer_store_dword [[REG14]], {{.*}} offset:24 +; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 +; MESA: buffer_store_dword [[REG15]], {{.*}} offset:28 +; MESA: v_mov_b32_e32 v31, 0x40e00000 + +; HSA: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x40e00000 +; HSA: buffer_store_dword [[REG8]], {{.*$}} +; HSA: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000 +; HSA: buffer_store_dword [[REG8]], {{.*}} offset:4 +; HSA: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 +; HSA: buffer_store_dword [[REG9]], {{.*}} offset:8 +; HSA: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 +; HSA: buffer_store_dword [[REG10]], {{.*}} offset:12 +; HSA: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 +; HSA: buffer_store_dword [[REG11]], {{.*}} offset:16 +; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 +; HSA: buffer_store_dword [[REG12]], {{.*}} offset:20 +; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 +; HSA: buffer_store_dword [[REG13]], {{.*}} offset:24 +; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 +; HSA: buffer_store_dword [[REG14]], {{.*}} offset:28 +; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 +; HSA: buffer_store_dword [[REG15]], {{.*}} offset:32 ; GCN: s_getpc define void @stack_8xv5f32() #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll --- a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll @@ -65,7 +65,7 @@ ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt -; GCN-NEXT: v_and_b32_e32 v1, 0x3ff, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0x3ff, v31 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: 
s_setpc_b64 define hidden i32 @use_workitem_id_x(i32 %arg0) #0 { @@ -75,7 +75,6 @@ } ; GCN-LABEL: {{^}}test_bitcast_use_workitem_id_x: -; GCN: v_mov_b32_e32 v1, v0 ; GCN: s_getpc_b64 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@lo+4 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -230,7 +230,7 @@ ; CI: NumSgprs: 48 ; VI-NOBUG: NumSgprs: 48 ; VI-BUG: NumSgprs: 96 -; GCN: NumVgprs: 24 +; GCN: NumVgprs: 32 define amdgpu_kernel void @count_use_sgpr96_external_call() { entry: tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> ) #1 @@ -244,7 +244,7 @@ ; CI: NumSgprs: 48 ; VI-NOBUG: NumSgprs: 48 ; VI-BUG: NumSgprs: 96 -; GCN: NumVgprs: 24 +; GCN: NumVgprs: 32 define amdgpu_kernel void @count_use_sgpr160_external_call() { entry: tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> ) #1 @@ -258,7 +258,7 @@ ; CI: NumSgprs: 48 ; VI-NOBUG: NumSgprs: 48 ; VI-BUG: NumSgprs: 96 -; GCN: NumVgprs: 24 +; GCN: NumVgprs: 32 define amdgpu_kernel void @count_use_vgpr160_external_call() { entry: tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> ) #1 diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -6,15 +6,15 @@ declare hidden void @external_void_func_void() #0 ; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: -; GCN: s_getpc_b64 s[34:35] -; GCN-NEXT: s_add_u32 s34, s34, -; GCN-NEXT: s_addc_u32 s35, s35, +; GCN: s_getpc_b64 s[44:45] +; GCN-NEXT: s_add_u32 s44, s44, +; GCN-NEXT: s_addc_u32 s45, s45, ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN: s_swappc_b64 s[30:31], s[34:35] +; GCN: s_swappc_b64 s[30:31], s[44:45] -; GCN-NEXT: #ASMSTART -; GCN-NEXT: #ASMEND -; GCN-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GCN-DAG: #ASMSTART +; GCN-DAG: #ASMEND +; GCN-DAG: s_swappc_b64 s[30:31], s[44:45] define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { call void @external_void_func_void() call void asm sideeffect "", ""() #0 @@ -25,24 +25,60 @@ ; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: ; MUBUF: buffer_store_dword ; FLATSCR: scratch_store_dword -; GCN: v_writelane_b32 v40, s33, 4 -; GCN: v_writelane_b32 v40, s34, 0 -; GCN: v_writelane_b32 v40, s35, 1 -; GCN: v_writelane_b32 v40, s30, 2 -; GCN: v_writelane_b32 v40, s31, 3 +; GCN: v_writelane_b32 v41, s33, 15 +; GCN-NEXT: v_writelane_b32 v41, s34, 0 +; GCN-NEXT: v_writelane_b32 v41, s35, 1 +; GCN-NEXT: v_writelane_b32 v41, s36, 2 +; GCN-NEXT: v_writelane_b32 v41, s37, 3 +; GCN-NEXT: v_writelane_b32 v41, s38, 4 +; GCN-NEXT: v_writelane_b32 v41, s39, 5 +; GCN-NEXT: v_writelane_b32 v41, s40, 6 +; GCN-NEXT: v_writelane_b32 v41, s41, 7 +; GCN-NEXT: v_writelane_b32 v41, s42, 8 +; GCN-NEXT: v_writelane_b32 v41, s43, 9 +; GCN-NEXT: v_writelane_b32 v41, s44, 10 +; GCN-NEXT: v_writelane_b32 v41, s46, 11 +; GCN-NEXT: v_writelane_b32 v41, s47, 12 +; GCN-NEXT: v_writelane_b32 v41, s30, 13 ; GCN: s_swappc_b64 -; GCN-NEXT: ;;#ASMSTART +; GCN-DAG: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_swappc_b64 -; MUBUF-DAG: 
v_readlane_b32 s4, v40, 2 -; MUBUF-DAG: v_readlane_b32 s5, v40, 3 -; FLATSCR-DAG: v_readlane_b32 s0, v40, 2 -; FLATSCR-DAG: v_readlane_b32 s1, v40, 3 -; GCN: v_readlane_b32 s35, v40, 1 -; GCN: v_readlane_b32 s34, v40, 0 - -; GCN: v_readlane_b32 s33, v40, 4 + +; MUBUF-DAG: v_readlane_b32 s4, v41, 13 +; MUBUF-DAG: v_readlane_b32 s5, v41, 14 +; MUBUF-DAG: v_readlane_b32 s47, v41, 12 +; MUBUF-DAG: v_readlane_b32 s46, v41, 11 +; MUBUF-DAG: v_readlane_b32 s44, v41, 10 +; MUBUF-DAG: v_readlane_b32 s43, v41, 9 +; MUBUF-DAG: v_readlane_b32 s42, v41, 8 +; MUBUF-DAG: v_readlane_b32 s41, v41, 7 +; MUBUF-DAG: v_readlane_b32 s40, v41, 6 +; MUBUF-DAG: v_readlane_b32 s39, v41, 5 +; MUBUF-DAG: v_readlane_b32 s38, v41, 4 +; MUBUF-DAG: v_readlane_b32 s37, v41, 3 +; MUBUF-DAG: v_readlane_b32 s36, v41, 2 +; MUBUF-DAG: v_readlane_b32 s35, v41, 1 +; MUBUF-DAG: v_readlane_b32 s34, v41, 0 + +; FLATSCR: v_readlane_b32 s0, v41, 13 +; FLATSCR-DAG: v_readlane_b32 s1, v41, 14 +; FLATSCR-DAG: v_readlane_b32 s47, v41, 12 +; FLATSCR-DAG: v_readlane_b32 s46, v41, 11 +; FLATSCR-DAG: v_readlane_b32 s44, v41, 10 +; FLATSCR-DAG: v_readlane_b32 s43, v41, 9 +; FLATSCR-DAG: v_readlane_b32 s42, v41, 8 +; FLATSCR-DAG: v_readlane_b32 s41, v41, 7 +; FLATSCR-DAG: v_readlane_b32 s40, v41, 6 +; FLATSCR-DAG: v_readlane_b32 s39, v41, 5 +; FLATSCR-DAG: v_readlane_b32 s38, v41, 4 +; FLATSCR-DAG: v_readlane_b32 s37, v41, 3 +; FLATSCR-DAG: v_readlane_b32 s36, v41, 2 +; FLATSCR-DAG: v_readlane_b32 s35, v41, 1 +; FLATSCR-DAG: v_readlane_b32 s34, v41, 0 +; FLATSCR-DAG: v_readlane_b32 s33, v41, 15 + ; MUBUF: buffer_load_dword ; FLATSCR: scratch_load_dword ; GCN: s_setpc_b64 @@ -54,19 +90,19 @@ } ; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: -; MUBUF: buffer_store_dword v40 -; FLATSCR: scratch_store_dword off, v40 -; GCN: v_writelane_b32 v40, s33, 4 +; MUBUF: buffer_store_dword v41 +; GCN: v_writelane_b32 v41, s33, 15 ; GCN: s_mov_b32 s33, s32 -; MUBUF: s_add_u32 s32, s32, 0x400 ; FLATSCR: s_add_u32 s32, s32, 16 +; FLATSCR: scratch_store_dword off, v40 +; MUBUF: s_add_u32 s32, s32, 0x400 ; GCN: s_swappc_b64 -; GCN-NEXT: s_swappc_b64 +; GCN-DAG: s_swappc_b64 -; GCN: v_readlane_b32 s33, v40, 4 -; MUBUF: buffer_load_dword v40 -; FLATSCR: scratch_load_dword v40 +; GCN: v_readlane_b32 s33, v41, 15 +; MUBUF: buffer_load_dword v41 +; FLATSCR: scratch_load_dword v41 define void @test_func_call_external_void_funcx2() #0 { call void @external_void_func_void() call void @external_void_func_void() @@ -124,7 +160,7 @@ ; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31: ; GCN: v_mov_b32_e32 v40, v31 -; GCN-NEXT: s_swappc_b64 +; GCN-DAG: s_swappc_b64 ; GCN-NEXT: v_mov_b32_e32 v31, v40 define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 { %v31 = call i32 asm sideeffect "; def $0", "={v31}"() @@ -136,18 +172,18 @@ ; FIXME: What is the expected behavior for reserved registers here? 
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33: -; MUBUF: s_getpc_b64 s[4:5] -; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; FLATSCR: s_getpc_b64 s[0:1] -; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; MUBUF: s_getpc_b64 s[18:19] +; MUBUF-NEXT: s_add_u32 s18, s18, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s19, s19, external_void_func_void@rel32@hi+12 +; FLATSCR: s_getpc_b64 s[16:17] +; FLATSCR-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12 ; GCN: s_mov_b32 s32, 0 ; GCN: #ASMSTART ; GCN-NEXT: ; def s33 ; GCN-NEXT: #ASMEND -; MUBUF: s_swappc_b64 s[30:31], s[4:5] -; FLATSCR: s_swappc_b64 s[30:31], s[0:1] +; MUBUF: s_swappc_b64 s[30:31], s[18:19] +; FLATSCR: s_swappc_b64 s[30:31], s[16:17] ; GCN: ;;#ASMSTART ; GCN-NEXT: ; use s33 ; GCN-NEXT: ;;#ASMEND @@ -163,12 +199,12 @@ ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}} ; GCN-NOT: s34 -; MUBUF: s_getpc_b64 s[4:5] -; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; FLATSCR: s_getpc_b64 s[0:1] -; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; MUBUF: s_getpc_b64 s[18:19] +; MUBUF-NEXT: s_add_u32 s18, s18, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s19, s19, external_void_func_void@rel32@hi+12 +; FLATSCR: s_getpc_b64 s[16:17] +; FLATSCR-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12 ; GCN: s_mov_b32 s32, 0 ; GCN-NOT: s34 @@ -177,8 +213,8 @@ ; GCN-NEXT: ;;#ASMEND ; GCN-NOT: s34 -; MUBUF: s_swappc_b64 s[30:31], s[4:5] -; FLATSCR: s_swappc_b64 s[30:31], s[0:1] +; MUBUF: s_swappc_b64 s[30:31], s[18:19] +; FLATSCR: s_swappc_b64 s[30:31], s[16:17] ; GCN-NOT: s34 @@ -196,12 +232,12 @@ ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}} ; GCN-NOT: v32 -; MUBUF: s_getpc_b64 s[4:5] -; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; FLATSCR: s_getpc_b64 s[0:1] -; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; MUBUF: s_getpc_b64 s[18:19] +; MUBUF-NEXT: s_add_u32 s18, s18, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s19, s19, external_void_func_void@rel32@hi+12 +; FLATSCR: s_getpc_b64 s[16:17] +; FLATSCR-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12 ; GCN: s_mov_b32 s32, 0 ; GCN-NOT: v40 @@ -209,8 +245,8 @@ ; GCN-NEXT: ; def v40 ; GCN-NEXT: ;;#ASMEND -; MUBUF: s_swappc_b64 s[30:31], s[4:5] -; FLATSCR: s_swappc_b64 s[30:31], s[0:1] +; MUBUF: s_swappc_b64 s[30:31], s[18:19] +; FLATSCR: s_swappc_b64 s[30:31], s[16:17] ; GCN-NOT: v40 diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -5,20 +5,30 @@ define amdgpu_kernel void @call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 { ; GCN-LABEL: call_memory_arg_load: ; GCN: ; 
%bb.0: -; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: ds_read_b32 v0, v0 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_load_dword s14, s[8:9], 0x0 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_add_u32 s8, s8, 8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, s14 +; GCN-NEXT: ds_read_b32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v3 +; GCN-NEXT: s_getpc_b64 s[18:19] +; GCN-NEXT: s_add_u32 s18, s18, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: s_endpgm %vgpr = load volatile i32, i32 addrspace(3)* %ptr call void @func(i32 %vgpr) ret void @@ -28,21 +38,29 @@ define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_memory_no_dep: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_store_dword v0, v0, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+12 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_add_u32 s8, s8, 16 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_dword v3, v3, s[14:15] +; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_getpc_b64 s[18:19] +; GCN-NEXT: s_add_u32 s18, s18, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: s_endpgm store i32 0, i32 addrspace(1)* %ptr call void @func(i32 0) ret void @@ -51,21 +69,29 @@ ; Should not wait after the call before memory define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call: -; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; 
GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: global_store_dword v40, v40, s[34:35] -; GCN-NEXT: s_endpgm +; GCN: %bb.0: +; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x0 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_add_u32 s8, s8, 16 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_getpc_b64 s[18:19] +; GCN-NEXT: s_add_u32 s18, s18, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: global_store_dword v40, v40, s[34:35] +; GCN-NEXT: s_endpgm call void @func(i32 0) store i32 0, i32 addrspace(1)* %ptr ret void @@ -74,20 +100,28 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call_return_val: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func.return@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func.return@rel32@hi+12 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: global_store_dword v40, v0, s[34:35] -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x0 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_add_u32 s8, s8, 16 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_getpc_b64 s[18:19] +; GCN-NEXT: s_add_u32 s18, s18, func.return@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s19, s19, func.return@rel32@hi+12 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: global_store_dword v40, v0, s[34:35] +; GCN-NEXT: s_endpgm %rv = call i32 @func.return(i32 0) store i32 %rv, i32 addrspace(1)* %ptr ret void @@ -97,19 +131,27 @@ define amdgpu_kernel void @call_got_load(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_got_load: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: 
v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_add_u32 s8, s8, 16 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_getpc_b64 s[14:15] +; GCN-NEXT: s_add_u32 s14, s14, got.func@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s15, s15, got.func@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: s_endpgm call void @got.func(i32 0) ret void } @@ -118,14 +160,14 @@ define void @tailcall_got_load(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: tailcall_got_load: ; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, got.func@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, got.func@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[16:17] tail call void @got.func(i32 0) ret void } @@ -134,12 +176,12 @@ define void @tail_call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 { ; GCN-LABEL: tail_call_memory_arg_load: ; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: ds_read_b32 v0, v0 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12 -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, func@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] %vgpr = load volatile i32, i32 addrspace(3)* %ptr tail call void @func(i32 %vgpr) ret void diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -amdgpu-fixed-function-abi=0 --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope 
-check-prefixes=GCN,CIVI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-fixed-function-abi=0 --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}use_dispatch_ptr: ; GCN: s_load_dword s{{[0-9]+}}, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -1,10 +1,8 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VARABI %s -; RUN: llc -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIXEDABI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt -; VARABI: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0 -; FIXEDABI: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31 +; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31 ; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -16,8 +14,7 @@ ; GCN-LABEL: {{^}}use_workitem_id_y: ; GCN: s_waitcnt -; VARABI: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10 -; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 +; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 ; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -29,8 +26,7 @@ ; GCN-LABEL: {{^}}use_workitem_id_z: ; GCN: s_waitcnt -; VARABI: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10 -; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10 +; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10 ; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -42,11 +38,9 @@ ; GCN-LABEL: {{^}}use_workitem_id_xy: ; GCN: s_waitcnt -; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 -; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 -; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 -; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 +; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 +; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] @@ -63,13 +57,10 @@ ; GCN-LABEL: {{^}}use_workitem_id_xyz: ; GCN: s_waitcnt -; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 -; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 -; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 -; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 -; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 -; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 +; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 +; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 +; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] @@ -89,11 +80,9 @@ ; GCN-LABEL: {{^}}use_workitem_id_xz: ; GCN: s_waitcnt -; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 -; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 -; FIXEDABI-DAG: 
v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 -; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 +; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 +; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] @@ -109,11 +98,9 @@ ; GCN-LABEL: {{^}}use_workitem_id_yz: ; GCN: s_waitcnt -; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 -; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 -; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 -; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 +; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 +; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] @@ -128,38 +115,31 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x: -; VARABI: enable_vgpr_workitem_id = 0 -; FIXEDABI: enable_vgpr_workitem_id = 2 +; GCN: enable_vgpr_workitem_id = 2 ; FIXEDA-NOT: v0 -; VARABI-NOT: v31 ; GCN: s_swappc_b64 -; FIXEDABI-NOT: v0 -; VARABI-NOT: v31 +; GCN-NOT: v0 define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 { call void @use_workitem_id_x() ret void } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y: -; VARABI: enable_vgpr_workitem_id = 1 -; FIXEDABI: enable_vgpr_workitem_id = 2 +; GCN: enable_vgpr_workitem_id = 2 -; FIXEDABI-NOT: v0 -; FIXEDABI-NOT: v1 +; GCN-NOT: v0 +; GCN-NOT: v1 -; VARABI-NOT: v31 -; VARABI: v_lshlrev_b32_e32 v0, 10, v1 -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] -; FIXEDABI-NOT: v0 -; FIXEDABI-NOT: v1 -; VARABI-NOT: v31 +; GCN-NOT: v0 +; GCN-NOT: v1 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 { @@ -170,16 +150,11 @@ ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z: ; GCN: enable_vgpr_workitem_id = 2 -; VARABI-NOT: v0 -; VARABI-NOT: v2 -; VARABI: v_lshlrev_b32_e32 v0, 20, v2 -; VARABI-NOT: v0 -; VARABI-NOT: v1 -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 { @@ -188,17 +163,11 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xy: -; VARABI-NOT: v0 -; VARABI-NOT: v1 -; VARABI: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 -; VARABI: v_or_b32_e32 v0, v0, [[IDY]] -; VARABI-NOT: v0 -; VARABI-NOT: v1 - -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] + +; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], 
v0, [[TMP1]] +; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 { @@ -207,18 +176,12 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xz: -; VARABI-NOT: v0 -; VARABI-NOT: v2 -; VARABI: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 -; VARABI: v_or_b32_e32 v0, v0, [[IDZ]] -; VARABI-NOT: v0 -; VARABI-NOT: v2 -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 { @@ -227,19 +190,12 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_yz: -; VARABI-NOT: v1 -; VARABI-NOT: v2 -; VARABI-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 -; VARABI-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 -; VARABI: v_or_b32_e32 v0, [[IDY]], [[IDZ]] -; VARABI-NOT: v1 -; VARABI-NOT: v2 -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 { @@ -248,21 +204,11 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xyz: -; VARABI-NOT: v0 -; VARABI-NOT: v1 -; VARABI-NOT: v2 -; VARABI-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 -; VARABI-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 -; VARABI-DAG: v_or_b32_e32 v0, v0, [[IDY]] -; VARABI-DAG: v_or_b32_e32 v0, v0, [[IDZ]] -; VARABI-NOT: v0 -; VARABI-NOT: v1 -; VARABI-NOT: v2 - -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] + +; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_xyz() #1 { @@ -299,8 +245,7 @@ ; GCN-LABEL: {{^}}other_arg_use_workitem_id_x: ; GCN: s_waitcnt -; VARABI-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1 -; FIXEDABI-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31 +; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] @@ -313,8 +258,7 @@ ; GCN-LABEL: {{^}}other_arg_use_workitem_id_y: ; GCN: s_waitcnt -; VARABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10 -; FIXEDABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 +; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_y(i32 %arg0) #1 { @@ -326,8 +270,7 @@ ; GCN-LABEL: {{^}}other_arg_use_workitem_id_z: ; GCN: s_waitcnt -; 
VARABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10 -; FIXEDABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10 +; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_z(i32 %arg0) #1 { @@ -339,16 +282,13 @@ ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x: -; VARABI: enable_vgpr_workitem_id = 0 -; FIXEDABI: enable_vgpr_workitem_id = 2 +; GCN: enable_vgpr_workitem_id = 2 -; VARABI: v_mov_b32_e32 v1, v0 -; VARABI: v_mov_b32_e32 v0, 0x22b -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 { @@ -358,20 +298,13 @@ ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y: -; VARABI: enable_vgpr_workitem_id = 1 - -; VARABI: v_lshlrev_b32_e32 v1, 10, v1 -; VARABI-NOT: v1 -; VARABI: v_mov_b32_e32 v0, 0x22b -; VARABI-NOT: v1 -; VARABI: s_swappc_b64 -; VARABI-NOT: v0 - -; FIXEDABI: enable_vgpr_workitem_id = 2 -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] + + +; GCN: enable_vgpr_workitem_id = 2 +; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 { call void @other_arg_use_workitem_id_y(i32 555) ret void @@ -380,29 +313,21 @@ ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z: ; GCN: enable_vgpr_workitem_id = 2 -; VARABI-DAG: v_mov_b32_e32 v0, 0x22b -; VARABI-DAG: v_lshlrev_b32_e32 v1, 20, v2 -; VARABI: s_swappc_b64 -; VARABI-NOT: v0 -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 { call void @other_arg_use_workitem_id_z(i32 555) ret void } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x: -; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}} -; VARABI: v_and_b32_e32 v32, 0x3ff, v32 -; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; VARABI: s_setpc_b64 -; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31 -; FIXEDABI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} +; GCN: v_and_b32_e32 v31, 0x3ff, v31 +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} define void @too_many_args_use_workitem_id_x( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, 
@@ -451,23 +376,19 @@ } ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x: -; VARABI: enable_vgpr_workitem_id = 0 -; VARABI: s_mov_b32 s32, 0 -; VARABI: buffer_store_dword v0, off, s[0:3], s32{{$}} -; VARABI: s_swappc_b64 -; FIXEDABI: enable_vgpr_workitem_id = 2 -; FIXEDABI-DAG: s_mov_b32 s32, 0 -; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}} -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; FIXEDABI-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] -; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}} +; GCN: enable_vgpr_workitem_id = 2 +; GCN-DAG: s_mov_b32 s32, 0 +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}} +; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; GCN-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; GCN: buffer_store_dword [[K]], off, s[0:3], s32{{$}} -; FIXEDABI: s_swappc_b64 +; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 { call void @too_many_args_use_workitem_id_x( i32 10, i32 20, i32 30, i32 40, @@ -482,15 +403,13 @@ } ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x: -; VARABI: s_mov_b32 s33, s32 -; VARABI: buffer_store_dword v1, off, s[0:3], s32{{$}} ; Touching the workitem id register is not necessary. -; FIXEDABI-NOT: v31 -; FIXEDABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}} -; FIXEDABI-NOT: v31 -; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}} -; FIXEDABI-NOT: v31 +; GCN-NOT: v31 +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}} +; GCN-NOT: v31 +; GCN: buffer_store_dword [[K]], off, s[0:3], s32{{$}} +; GCN-NOT: v31 ; GCN: s_swappc_b64 define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { @@ -539,21 +458,15 @@ ; frame[2] = VGPR spill slot ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval: -; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VARABI-NEXT: s_waitcnt -; VARABI-NEXT: v_and_b32_e32 v32, 0x3ff, v32 -; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32 -; VARABI: buffer_load_dword v0, off, s[0:3], s32 glc{{$}} -; VARABI: s_setpc_b64 -; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31 -; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31 +; GCN: v_and_b32_e32 v31, 0x3ff, v31 +; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31 -; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32{{$}} -; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}} -; FIXEDABI: s_setpc_b64 +; GCN: buffer_load_dword v0, off, s[0:3], s32{{$}} +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}} +; GCN: s_setpc_b64 define void @too_many_args_use_workitem_id_x_byval( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, @@ -607,36 +520,27 @@ ; sp[2] = stack passed workitem ID x ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval: -; VARABI: enable_vgpr_workitem_id = 0 -; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; VARABI: s_movk_i32 s32, 0x400{{$}} -; VARABI: buffer_store_dword [[K]], off, s[0:3], 0 offset:4 -; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4 -; VARABI: 
buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4 -; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} -; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]], -; VARABI: s_swappc_b64 -; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7 -; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}} -; FIXEDABI: s_movk_i32 s32, 0x400{{$}} -; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140 +; GCN: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7 +; GCN: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}} +; GCN: s_movk_i32 s32, 0x400{{$}} +; GCN: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140 -; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} +; GCN: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} ; FIXME: Why this reload? -; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0 offset:4{{$}} +; GCN: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0 offset:4{{$}} -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; FIXEDABI-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] -; FIXEDABI-NOT: s32 -; FIXEDABI: buffer_store_dword [[RELOAD]], off, s[0:3], s32 offset:4 -; FIXEDABI: s_swappc_b64 +; GCN-NOT: s32 +; GCN: buffer_store_dword [[RELOAD]], off, s[0:3], s32 offset:4 +; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 999, i32 addrspace(5)* %alloca @@ -654,26 +558,19 @@ } ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval: -; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; VARABI: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4 -; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}} -; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} -; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]], -; VARABI: s_swappc_b64 ; FIXED-ABI-NOT: v31 -; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}} -; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}} -; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}} -; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} -; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}} +; GCN: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}} +; GCN: buffer_store_dword [[K0]], off, s[0:3], s33{{$}} +; GCN: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}} +; GCN: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} +; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}} ; FIXED-ABI-NOT: v31 -; FIXEDABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} +; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} ; FIXED-ABI-NOT: v31 -; FIXEDABI: s_swappc_b64 +; GCN: s_swappc_b64 define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 999, i32 addrspace(5)* %alloca @@ -691,29 +588,17 @@ } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz: -; VARABI-NOT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} -; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}} -; VARABI-NOT: buffer_load_dword - -; VARABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v32 -; VARABI-NOT: 
buffer_load_dword -; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]] -; VARABI-NOT: buffer_load_dword -; VARABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v32, 10, 10 -; VARABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v32, 20, 10 -; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]] -; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]] -; VARABI: s_setpc_b64 - - -; FIXEDABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v31 -; FIXEDABI-NOT: buffer_load_dword -; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]] -; FIXEDABI-NOT: buffer_load_dword -; FIXEDABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v31, 10, 10 -; FIXEDABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v31, 20, 10 -; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]] -; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]] + + + +; GCN: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v31 +; GCN-NOT: buffer_load_dword +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]] +; GCN-NOT: buffer_load_dword +; GCN: v_bfe_u32 [[BFE_Y:v[0-9]+]], v31, 10, 10 +; GCN-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v31, 20, 10 +; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]] define void @too_many_args_use_workitem_id_xyz( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, @@ -774,12 +659,10 @@ ; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 ; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 ; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; VARABI-DAG: v_or_b32_e32 [[PACKEDID:v[0-9]+]], [[TMP2]], [[TMP0]] -; VARABI: buffer_store_dword [[PACKEDID]], off, s[0:3], s32{{$}} -; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] -; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140 -; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}} +; GCN-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140 +; GCN: buffer_store_dword [[K]], off, s[0:3], s32{{$}} ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 { diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -64,45 +64,64 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_call: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_u32 s4, s4, s7 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; GFX803-NEXT: s_add_u32 s0, s0, s7 +; GFX803-NEXT: s_add_u32 s12, s12, s17 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX803-NEXT: s_add_u32 s0, s0, s17 +; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 -; GFX803-NEXT: s_getpc_b64 s[4:5] -; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 +; GFX803-NEXT: s_mov_b32 s12, s14 +; GFX803-NEXT: s_mov_b32 s13, s15 +; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX803-NEXT: s_mov_b32 s14, s16 +; GFX803-NEXT: s_getpc_b64 s[18:19] +; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX803-NEXT: s_mov_b32 s32, 0 -; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX803-NEXT: s_swappc_b64 s[30:31], 
s[18:19] ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_kern_call: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s7 +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s17 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 -; GFX900-NEXT: s_getpc_b64 s[4:5] -; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_getpc_b64 s[18:19] +; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX900-NEXT: s_endpgm -; + ; GFX1010-LABEL: test_kern_call: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_add_u32 s4, s4, s7 -; GFX1010-NEXT: s_mov_b32 s32, 0 -; GFX1010-NEXT: s_addc_u32 s5, s5, 0 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 -; GFX1010-NEXT: s_add_u32 s0, s0, s7 -; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_getpc_b64 s[4:5] -; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX1010-NEXT: s_endpgm +; GFX1010-NEXT: s_add_u32 s12, s12, s17 +; GFX1010-NEXT: s_mov_b32 s32, 0 +; GFX1010-NEXT: s_addc_u32 s13, s13, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1010-NEXT: s_add_u32 s0, s0, s17 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: s_mov_b32 s12, s14 +; GFX1010-NEXT: s_mov_b32 s13, s15 +; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_mov_b32 s14, s16 +; GFX1010-NEXT: s_getpc_b64 s[18:19] +; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX1010-NEXT: s_endpgm entry: tail call void @ex() #0 ret void @@ -111,54 +130,73 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_stack_and_call: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_u32 s4, s4, s7 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; GFX803-NEXT: s_add_u32 s0, s0, s7 +; GFX803-NEXT: s_add_u32 s12, s12, s17 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX803-NEXT: s_add_u32 s0, s0, s17 +; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 -; GFX803-NEXT: v_mov_b32_e32 v0, 0 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 -; GFX803-NEXT: s_getpc_b64 s[4:5] -; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 +; GFX803-NEXT: s_mov_b32 s12, s14 +; GFX803-NEXT: v_mov_b32_e32 v3, 0 +; GFX803-NEXT: s_mov_b32 s13, s15 +; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX803-NEXT: s_mov_b32 s14, s16 +; GFX803-NEXT: s_getpc_b64 s[18:19] +; GFX803-NEXT: 
s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX803-NEXT: s_movk_i32 s32, 0x400 -; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_kern_stack_and_call: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s7 +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s17 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: s_getpc_b64 s[4:5] -; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_getpc_b64 s[18:19] +; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX900-NEXT: s_movk_i32 s32, 0x400 -; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX900-NEXT: s_endpgm -; + ; GFX1010-LABEL: test_kern_stack_and_call: -; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_add_u32 s4, s4, s7 -; GFX1010-NEXT: s_movk_i32 s32, 0x200 -; GFX1010-NEXT: s_addc_u32 s5, s5, 0 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 -; GFX1010-NEXT: v_mov_b32_e32 v0, 0 -; GFX1010-NEXT: s_add_u32 s0, s0, s7 -; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_getpc_b64 s[4:5] -; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 -; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX1010-NEXT: s_endpgm +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_add_u32 s12, s12, s17 +; GFX1010-NEXT: s_movk_i32 s32, 0x200 +; GFX1010-NEXT: s_addc_u32 s13, s13, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1010-NEXT: v_mov_b32_e32 v3, 0 +; GFX1010-NEXT: s_add_u32 s0, s0, s17 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: s_mov_b32 s12, s14 +; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_mov_b32 s13, s15 +; GFX1010-NEXT: s_mov_b32 s14, s16 +; GFX1010-NEXT: s_getpc_b64 s[18:19] +; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 +; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX1010-NEXT: s_endpgm entry: %x = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %x, align 4 @@ -171,7 +209,7 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_mov_b32 s33, 
0
; GFX803-NEXT: s_endpgm
-;
+
; GFX900-LABEL: test_force_fp_kern_empty:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_mov_b32 s33, 0
@@ -233,48 +271,67 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_call:
; GFX803: ; %bb.0: ; %entry
-; GFX803-NEXT: s_add_u32 s4, s4, s7
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; GFX803-NEXT: s_add_u32 s0, s0, s7
+; GFX803-NEXT: s_add_u32 s12, s12, s17
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX803-NEXT: s_add_u32 s0, s0, s17
+; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_addc_u32 s1, s1, 0
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
-; GFX803-NEXT: s_getpc_b64 s[4:5]
-; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
-; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
+; GFX803-NEXT: s_mov_b32 s12, s14
+; GFX803-NEXT: s_mov_b32 s13, s15
+; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX803-NEXT: s_mov_b32 s14, s16
+; GFX803-NEXT: s_getpc_b64 s[18:19]
+; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
+; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT: s_mov_b32 s32, 0
; GFX803-NEXT: s_mov_b32 s33, 0
-; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_call:
; GFX900: ; %bb.0: ; %entry
-; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
-; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; GFX900-NEXT: s_add_u32 s0, s0, s7
+; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GFX900-NEXT: s_add_u32 s0, s0, s17
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_addc_u32 s1, s1, 0
-; GFX900-NEXT: s_getpc_b64 s[4:5]
-; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
-; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
+; GFX900-NEXT: s_mov_b32 s12, s14
+; GFX900-NEXT: s_mov_b32 s13, s15
+; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX900-NEXT: s_mov_b32 s14, s16
+; GFX900-NEXT: s_getpc_b64 s[18:19]
+; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
+; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT: s_mov_b32 s32, 0
; GFX900-NEXT: s_mov_b32 s33, 0
-; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_call:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_add_u32 s4, s4, s7
-; GFX1010-NEXT: s_mov_b32 s32, 0
-; GFX1010-NEXT: s_mov_b32 s33, 0
-; GFX1010-NEXT: s_addc_u32 s5, s5, 0
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
-; GFX1010-NEXT: s_add_u32 s0, s0, s7
-; GFX1010-NEXT: s_addc_u32 s1, s1, 0
-; GFX1010-NEXT: s_getpc_b64 s[4:5]
-; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
-; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
-; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1010-NEXT: s_endpgm
+; GFX1010-NEXT: s_add_u32 s12, s12, s17
+; GFX1010-NEXT: s_mov_b32 s32, 0
+; GFX1010-NEXT: s_mov_b32 s33, 0
+; GFX1010-NEXT: s_addc_u32 s13, s13, 0
+; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1010-NEXT: s_add_u32 s0, s0, s17
+; GFX1010-NEXT: s_addc_u32 s1, s1, 0
+; GFX1010-NEXT: s_mov_b32 s12, s14
+; GFX1010-NEXT: s_mov_b32 s13, s15
+; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1010-NEXT: s_mov_b32 s14, s16
+; GFX1010-NEXT: s_getpc_b64 s[18:19]
+; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
+; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
+; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX1010-NEXT: s_endpgm
entry:
tail call void @ex() #2
ret void
@@ -283,57 +340,76 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
-; GFX803-NEXT: s_add_u32 s4, s4, s7
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; GFX803-NEXT: s_add_u32 s0, s0, s7
-; GFX803-NEXT: s_mov_b32 s33, 0
+; GFX803-NEXT: s_add_u32 s12, s12, s17
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX803-NEXT: s_add_u32 s0, s0, s17
+; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_addc_u32 s1, s1, 0
-; GFX803-NEXT: v_mov_b32_e32 v0, 0
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
-; GFX803-NEXT: s_getpc_b64 s[4:5]
-; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
-; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
+; GFX803-NEXT: s_mov_b32 s12, s14
+; GFX803-NEXT: s_mov_b32 s33, 0
+; GFX803-NEXT: v_mov_b32_e32 v3, 0
+; GFX803-NEXT: s_mov_b32 s13, s15
+; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX803-NEXT: s_mov_b32 s14, s16
+; GFX803-NEXT: s_getpc_b64 s[18:19]
+; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
+; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT: s_movk_i32 s32, 0x400
-; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
+; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack_and_call:
; GFX900: ; %bb.0: ; %entry
-; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
-; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; GFX900-NEXT: s_add_u32 s0, s0, s7
+; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GFX900-NEXT: s_add_u32 s0, s0, s17
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_addc_u32 s1, s1, 0
+; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: s_mov_b32 s33, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: s_getpc_b64 s[4:5]
-; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
-; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: s_mov_b32 s13, s15
+; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX900-NEXT: s_mov_b32 s14, s16
+; GFX900-NEXT: s_getpc_b64 s[18:19]
+; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
+; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT: s_movk_i32 s32, 0x400
-; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
+; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_add_u32 s4, s4, s7
-; GFX1010-NEXT: s_movk_i32 s32, 0x200
-; GFX1010-NEXT: s_mov_b32 s33, 0
-; GFX1010-NEXT: s_addc_u32 s5, s5, 0
-; GFX1010-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 -; GFX1010-NEXT: v_mov_b32_e32 v0, 0 -; GFX1010-NEXT: s_add_u32 s0, s0, s7 -; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_getpc_b64 s[4:5] -; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 -; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX1010-NEXT: s_endpgm +; GFX1010-NEXT: s_add_u32 s12, s12, s17 +; GFX1010-NEXT: s_movk_i32 s32, 0x200 +; GFX1010-NEXT: s_mov_b32 s33, 0 +; GFX1010-NEXT: s_addc_u32 s13, s13, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1010-NEXT: v_mov_b32_e32 v3, 0 +; GFX1010-NEXT: s_add_u32 s0, s0, s17 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: s_mov_b32 s12, s14 +; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_mov_b32 s13, s15 +; GFX1010-NEXT: s_mov_b32 s14, s16 +; GFX1010-NEXT: s_getpc_b64 s[18:19] +; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4 +; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX1010-NEXT: s_endpgm entry: %x = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %x, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -27,18 +27,18 @@ ; GCN-LABEL: call_split_type_used_outside_block_v2f32: ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: v_writelane_b32 v40, s33, 2 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func_v2f32@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func_v2f32@rel32@hi+12 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, func_v2f32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, func_v2f32@rel32@hi+12 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s4, v40, 0 ; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 @@ -59,30 +59,29 @@ define float @call_split_type_used_outside_block_v3f32() #0 { ; GCN-LABEL: call_split_type_used_outside_block_v3f32: -; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func_v3f32@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, 
s5, func_v3f32@rel32@hi+12 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s4, v40, 0 -; GCN-NEXT: v_readlane_b32 s5, v40, 1 -; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] -bb0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_add_u32 s32, s32, 0x400 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, func_v3f32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, func_v3f32@rel32@hi+12 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 +; GCN-NEXT: s_sub_u32 s32, s32, 0x400 +; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] %split.ret.type = call <3 x float> @func_v3f32() br label %bb1 @@ -94,28 +93,29 @@ define half @call_split_type_used_outside_block_v4f16() #0 { ; GCN-LABEL: call_split_type_used_outside_block_v4f16: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func_v4f16@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func_v4f16@rel32@hi+12 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s4, v40, 0 -; GCN-NEXT: v_readlane_b32 s5, v40, 1 -; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_add_u32 s32, s32, 0x400 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, func_v4f16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, func_v4f16@rel32@hi+12 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 +; GCN-NEXT: s_sub_u32 s32, s32, 0x400 +; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 
exec, s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] + bb0: %split.ret.type = call <4 x half> @func_v4f16() br label %bb1 @@ -128,29 +128,29 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 { ; GCN-LABEL: call_split_type_used_outside_block_struct: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func_struct@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func_struct@rel32@hi+12 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s4, v40, 0 -; GCN-NEXT: v_mov_b32_e32 v1, v4 -; GCN-NEXT: v_readlane_b32 s5, v40, 1 -; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_add_u32 s32, s32, 0x400 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, func_struct@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, func_struct@rel32@hi+12 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_mov_b32_e32 v1, v4 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 +; GCN-NEXT: s_sub_u32 s32, s32, 0x400 +; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] bb0: %split.ret.type = call { <4 x i32>, <4 x half> } @func_struct() br label %bb1 @@ -168,32 +168,40 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 { ; GCN-LABEL: v3i16_registers: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, 1, s4 -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1 -; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_cbranch_vccnz BB4_2 -; GCN-NEXT: ; %bb.1: ; %if.else -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func_v3i16@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func_v3i16@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: s_branch BB4_3 -; GCN-NEXT: BB4_2: -; GCN-NEXT: s_mov_b32 s4, 0 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: BB4_3: ; %if.end -; GCN-NEXT: global_store_short v[0:1], v1, off -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; 
GCN-NEXT: s_load_dword s12, s[8:9], 0x0
+; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GCN-NEXT: s_add_u32 s0, s0, s17
+; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 s32, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s12, 1, s12
+; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 1
+; GCN-NEXT: s_and_b64 vcc, exec, s[12:13]
+; GCN-NEXT: s_cbranch_vccnz BB4_2
+; GCN-NEXT: ; %bb.1: ; %if.else
+; GCN-NEXT: s_add_u32 s8, s8, 8
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_mov_b32 s12, s14
+; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
+; GCN-NEXT: s_mov_b32 s13, s15
+; GCN-NEXT: s_mov_b32 s14, s16
+; GCN-NEXT: s_getpc_b64 s[18:19]
+; GCN-NEXT: s_add_u32 s18, s18, func_v3i16@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s19, s19, func_v3i16@rel32@hi+12
+; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GCN-NEXT: s_branch BB4_3
+; GCN-NEXT: BB4_2:
+; GCN-NEXT: s_mov_b32 s4, 0
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: BB4_3: ; %if.end
+; GCN-NEXT: global_store_short v[0:1], v1, off
+; GCN-NEXT: global_store_dword v[0:1], v0, off
+; GCN-NEXT: s_endpgm
entry:
br i1 %cond, label %if.then, label %if.else
@@ -213,32 +221,36 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
; GCN-LABEL: v3f16_registers:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_and_b32 s4, 1, s4
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1
-; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s32, 0
-; GCN-NEXT: s_cbranch_vccnz BB5_2
-; GCN-NEXT: ; %bb.1: ; %if.else
-; GCN-NEXT: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, func_v3f16@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, func_v3f16@rel32@hi+12
-; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: s_branch BB5_3
+; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GCN-NEXT: s_load_dword s12, s[8:9], 0x0
+; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GCN-NEXT: s_add_u32 s0, s0, s17
+; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 s32, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s12, 1, s12
+; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 1
+; GCN-NEXT: s_and_b64 vcc, exec, s[12:13]
+; GCN-NEXT: s_cbranch_vccnz BB5_2
+; GCN-NEXT: ; %bb.1: ; %if.else
+; GCN-NEXT: s_add_u32 s8, s8, 8
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_mov_b32 s12, s14
+; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
+; GCN-NEXT: s_mov_b32 s13, s15
+; GCN-NEXT: s_mov_b32 s14, s16
+; GCN-NEXT: s_getpc_b64 s[18:19]
+; GCN-NEXT: s_add_u32 s18, s18, func_v3f16@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s19, s19, func_v3f16@rel32@hi+12
+; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GCN-NEXT: s_branch BB5_3
; GCN-NEXT: BB5_2:
-; GCN-NEXT: s_mov_b32 s4, 0
-; GCN-NEXT: s_mov_b32 s5, s4
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: BB5_3: ; %if.end
-; GCN-NEXT: global_store_short v[0:1], v1, off
-; GCN-NEXT: global_store_dword v[0:1], v0, off
-; GCN-NEXT: s_endpgm
+; GCN-NEXT: s_mov_b32 s4, 0
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: BB5_3: ; %if.end
+; GCN-NEXT: global_store_short v[0:1], v1, off
+; GCN-NEXT: global_store_dword v[0:1], v0, off
+; GCN-NEXT: s_endpgm
entry:
br i1 %cond, label %if.then, label %if.else
diff --git
a/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll b/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll --- a/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll @@ -11,25 +11,25 @@ ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN: %6:vgpr_32, %7:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GCN: %8:vgpr_32, %9:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GCN: %10:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %8, 0, 0, implicit $mode, implicit $exec + ; GCN: %13:vgpr_32, %14:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN: %15:vgpr_32, %16:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN: %17:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %15, 0, 0, implicit $mode, implicit $exec ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode - ; GCN: %14:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %10, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec - ; GCN: %15:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %14, 0, %10, 0, %10, 0, 0, implicit $mode, implicit $exec - ; GCN: %16:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %6, 0, %15, 0, 0, implicit $mode, implicit $exec - ; GCN: %17:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %16, 0, %6, 0, 0, implicit $mode, implicit $exec - ; GCN: %18:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %17, 0, %15, 0, %16, 0, 0, implicit $mode, implicit $exec - ; GCN: %19:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %18, 0, %6, 0, 0, implicit $mode, implicit $exec + ; GCN: %21:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %15, 0, %17, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec + ; GCN: %22:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %21, 0, %17, 0, %17, 0, 0, implicit $mode, implicit $exec + ; GCN: %23:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %13, 0, %22, 0, 0, implicit $mode, implicit $exec + ; GCN: %24:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %15, 0, %23, 0, %13, 0, 0, implicit $mode, implicit $exec + ; GCN: %25:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %24, 0, %22, 0, %23, 0, 0, implicit $mode, implicit $exec + ; GCN: %26:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %15, 0, %25, 0, %13, 0, 0, implicit $mode, implicit $exec ; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode - ; GCN: $vcc = COPY %7 - ; GCN: %20:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %19, 0, %15, 0, %18, 0, 0, implicit $mode, implicit $vcc, implicit $exec - ; GCN: %21:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %20, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN: $vcc = COPY %14 + ; GCN: %27:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %26, 0, %22, 0, %25, 0, 0, implicit $mode, implicit $vcc, implicit $exec + ; GCN: %28:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %27, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GCN: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] - ; GCN: $vgpr0 = COPY %21 + ; GCN: $vgpr0 = COPY %28 ; GCN: 
[[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] ; GCN: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 entry: @@ -44,25 +44,25 @@ ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN: %6:vgpr_32, %7:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GCN: %8:vgpr_32, %9:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GCN: %10:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %8, 0, 0, implicit $mode, implicit $exec + ; GCN: %13:vgpr_32, %14:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN: %15:vgpr_32, %16:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN: %17:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %15, 0, 0, implicit $mode, implicit $exec ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode - ; GCN: %14:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %10, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec - ; GCN: %15:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %14, 0, %10, 0, %10, 0, 0, implicit $mode, implicit $exec - ; GCN: %16:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %6, 0, %15, 0, 0, implicit $mode, implicit $exec - ; GCN: %17:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %16, 0, %6, 0, 0, implicit $mode, implicit $exec - ; GCN: %18:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %17, 0, %15, 0, %16, 0, 0, implicit $mode, implicit $exec - ; GCN: %19:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %18, 0, %6, 0, 0, implicit $mode, implicit $exec + ; GCN: %21:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %15, 0, %17, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec + ; GCN: %22:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %21, 0, %17, 0, %17, 0, 0, implicit $mode, implicit $exec + ; GCN: %23:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %13, 0, %22, 0, 0, implicit $mode, implicit $exec + ; GCN: %24:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %15, 0, %23, 0, %13, 0, 0, implicit $mode, implicit $exec + ; GCN: %25:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %24, 0, %22, 0, %23, 0, 0, implicit $mode, implicit $exec + ; GCN: %26:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %15, 0, %25, 0, %13, 0, 0, implicit $mode, implicit $exec ; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode - ; GCN: $vcc = COPY %7 - ; GCN: %20:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %19, 0, %15, 0, %18, 0, 0, implicit $mode, implicit $vcc, implicit $exec - ; GCN: %21:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %20, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN: $vcc = COPY %14 + ; GCN: %27:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %26, 0, %22, 0, %25, 0, 0, implicit $mode, implicit $vcc, implicit $exec + ; GCN: %28:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %27, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GCN: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] - ; GCN: $vgpr0 = COPY %21 + ; GCN: $vgpr0 = COPY %28 ; 
GCN: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] ; GCN: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 entry: diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll --- a/llvm/test/CodeGen/AMDGPU/ipra.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra.ll @@ -31,7 +31,7 @@ ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8 ; GCN: ; NumSgprs: 37 -; GCN: ; NumVgprs: 9 +; GCN: ; NumVgprs: 32 define amdgpu_kernel void @kernel_call() #0 { %vgpr = load volatile i32, i32 addrspace(1)* undef tail call void @func() @@ -53,7 +53,7 @@ ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8 ; GCN: ; NumSgprs: 32 -; GCN: ; NumVgprs: 9 +; GCN: ; NumVgprs: 32 define void @func_regular_call() #1 { %vgpr = load volatile i32, i32 addrspace(1)* undef tail call void @func() @@ -63,13 +63,13 @@ ; GCN-LABEL: {{^}}func_tail_call: ; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, -; GCN-NEXT: s_addc_u32 s5, -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, +; GCN-NEXT: s_addc_u32 s17, +; GCN-NEXT: s_setpc_b64 s[16:17] ; GCN: ; NumSgprs: 32 -; GCN: ; NumVgprs: 8 +; GCN: ; NumVgprs: 32 define void @func_tail_call() #1 { tail call void @func() ret void @@ -82,7 +82,7 @@ ; GCN: s_setpc_b64 ; GCN: ; NumSgprs: 32 -; GCN: ; NumVgprs: 9 +; GCN: ; NumVgprs: 32 define void @func_call_tail_call() #1 { %vgpr = load volatile i32, i32 addrspace(1)* undef tail call void @func() diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll --- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll @@ -15,7 +15,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_write_b32 v0, v0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX8-NEXT: s_trap 2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -25,7 +25,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: ds_write_b32 v0, v0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX9-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX9-NEXT: s_trap 2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -38,7 +38,7 @@ ; GCN-LABEL: func_use_lds_global_constexpr_cast: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[0:1], s[4:5] +; GCN-NEXT: s_mov_b64 s[0:1], s[6:7] ; GCN-NEXT: s_trap 2 ; GCN-NEXT: s_setpc_b64 s[30:31] store i32 ptrtoint (float addrspace(3)* @lds to i32), i32 addrspace(1)* undef, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -59,7 +59,8 @@ ; GCN-LABEL: {{^}}func_implicitarg_ptr: ; GCN: s_waitcnt -; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 +; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @func_implicitarg_ptr() #0 { @@ -71,7 +72,8 @@ ; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr: ; GCN: s_waitcnt -; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 +; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @opencl_func_implicitarg_ptr() #0 { @@ -112,10 +114,11 @@ ; HSA: kernarg_segment_byte_size = 112 ; 
MESA: kernarg_segment_byte_size = 128 -; HSA: s_add_u32 s4, s4, 0x70 +; HSA: s_add_u32 s8, s8, 0x70 ; MESA: s_add_u32 s4, s4, 0x70 -; GCN: s_addc_u32 s5, s5, 0{{$}} +; HSA: s_addc_u32 s9, s9, 0{{$}} +; MESA: s_addc_u32 s5, s5, 0{{$}} ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 { call void @func_implicitarg_ptr() @@ -127,8 +130,10 @@ ; HSA: kernarg_segment_byte_size = 160 ; MESA: kernarg_segment_byte_size = 128 -; GCN: s_add_u32 s4, s4, 0x70 -; GCN: s_addc_u32 s5, s5, 0{{$}} +; HSA: s_add_u32 s8, s8, 0x70 +; HSA: s_addc_u32 s9, s9, 0{{$}} +; MESA: s_add_u32 s4, s4, 0x70 +; MESA: s_addc_u32 s5, s5, 0{{$}} ; GCN: s_swappc_b64 define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 { call void @func_implicitarg_ptr() @@ -136,18 +141,24 @@ } ; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func: -; GCN-NOT: s4 -; GCN-NOT: s5 -; GCN-NOT: s[4:5] +; HSA-NOT: s8 +; HSA-NOT: s9 +; HSA-NOT: s[8:9] +; MESA-NOT: s4 +; MESA-NOT: s5 +; MESA-NOT: s[4:5] define void @func_call_implicitarg_ptr_func() #0 { call void @func_implicitarg_ptr() ret void } ; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func: -; GCN-NOT: s4 -; GCN-NOT: s5 -; GCN-NOT: s[4:5] +; HSA-NOT: s8 +; HSA-NOT: s9 +; HSA-NOT: s[8:9] +; MESA-NOT: s4 +; MESA-NOT: s5 +; MESA-NOT: s[4:5] define void @opencl_func_call_implicitarg_ptr_func() #0 { call void @func_implicitarg_ptr() ret void @@ -157,7 +168,8 @@ ; GCN: s_waitcnt ; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0 ; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0 -; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 +; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; GCN: s_waitcnt lgkmcnt(0) define void @func_kernarg_implicitarg_ptr() #0 { %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() @@ -173,7 +185,8 @@ ; GCN: s_waitcnt ; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0 ; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0 -; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 +; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; GCN: s_waitcnt lgkmcnt(0) define void @opencl_func_kernarg_implicitarg_ptr() #0 { %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() @@ -186,8 +199,10 @@ } ; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func: -; GCN: s_add_u32 s4, s4, 0x70 -; GCN: s_addc_u32 s5, s5, 0 +; HSA: s_add_u32 s8, s8, 0x70 +; HSA: s_addc_u32 s9, s9, 0 +; MESA: s_add_u32 s4, s4, 0x70 +; MESA: s_addc_u32 s5, s5, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 { call void @func_kernarg_implicitarg_ptr() diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -186,50 +186,98 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 { ; GFX9-LABEL: slsr1_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v43, s33, 4 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_add_u32 s32, s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v43, s34, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, 
foo@gotpcrel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v43, s35, 1 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, v1 -; GFX9-NEXT: v_mov_b32_e32 v41, v0 -; GFX9-NEXT: v_writelane_b32 v43, s30, 2 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 -; GFX9-NEXT: v_writelane_b32 v43, s31, 3 -; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42 -; GFX9-NEXT: v_mov_b32_e32 v0, v40 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_add_u32_e32 v0, v40, v42 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v43, 2 -; GFX9-NEXT: v_readlane_b32 s5, v43, 3 -; GFX9-NEXT: v_readlane_b32 s35, v43, 1 -; GFX9-NEXT: v_readlane_b32 s34, v43, 0 -; GFX9-NEXT: s_sub_u32 s32, s32, 0x800 -; GFX9-NEXT: v_readlane_b32 s33, v43, 4 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[16:17], -1 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-NEXT: v_writelane_b32 v44, s33, 15 +; GFX9-NEXT: v_writelane_b32 v44, s34, 0 +; GFX9-NEXT: v_writelane_b32 v44, s35, 1 +; GFX9-NEXT: v_writelane_b32 v44, s36, 2 +; GFX9-NEXT: v_writelane_b32 v44, s37, 3 +; GFX9-NEXT: v_writelane_b32 v44, s38, 4 +; GFX9-NEXT: v_writelane_b32 v44, s39, 5 +; GFX9-NEXT: v_writelane_b32 v44, s40, 6 +; GFX9-NEXT: v_writelane_b32 v44, s41, 7 +; GFX9-NEXT: v_writelane_b32 v44, s42, 8 +; GFX9-NEXT: v_writelane_b32 v44, s43, 9 +; GFX9-NEXT: v_writelane_b32 v44, s44, 10 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x800 +; GFX9-NEXT: s_mov_b64 s[40:41], s[4:5] +; GFX9-NEXT: v_writelane_b32 v44, s46, 11 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v44, s47, 12 +; GFX9-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: v_mov_b32_e32 v42, v0 +; GFX9-NEXT: v_writelane_b32 v44, s30, 13 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, v42, v41 +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: v_writelane_b32 v44, s31, 14 +; GFX9-NEXT: v_mov_b32_e32 v40, v31 +; GFX9-NEXT: s_mov_b32 s42, s14 +; GFX9-NEXT: s_mov_b32 s43, s13 +; GFX9-NEXT: s_mov_b32 s44, s12 +; GFX9-NEXT: s_mov_b64 s[34:35], s[10:11] +; GFX9-NEXT: s_mov_b64 s[36:37], s[8:9] +; GFX9-NEXT: 
s_mov_b64 s[38:39], s[6:7] +; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v41 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[46:47] +; GFX9-NEXT: v_mad_u32_u24 v41, v42, v41, v43 +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] +; GFX9-NEXT: s_mov_b64 s[8:9], s[36:37] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s44 +; GFX9-NEXT: s_mov_b32 s13, s43 +; GFX9-NEXT: s_mov_b32 s14, s42 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: v_mov_b32_e32 v0, v41 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[46:47] +; GFX9-NEXT: v_add_u32_e32 v0, v41, v43 +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] +; GFX9-NEXT: s_mov_b64 s[8:9], s[36:37] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s44 +; GFX9-NEXT: s_mov_b32 s13, s43 +; GFX9-NEXT: s_mov_b32 s14, s42 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[46:47] +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s4, v44, 13 +; GFX9-NEXT: v_readlane_b32 s5, v44, 14 +; GFX9-NEXT: v_readlane_b32 s47, v44, 12 +; GFX9-NEXT: v_readlane_b32 s46, v44, 11 +; GFX9-NEXT: v_readlane_b32 s44, v44, 10 +; GFX9-NEXT: v_readlane_b32 s43, v44, 9 +; GFX9-NEXT: v_readlane_b32 s42, v44, 8 +; GFX9-NEXT: v_readlane_b32 s41, v44, 7 +; GFX9-NEXT: v_readlane_b32 s40, v44, 6 +; GFX9-NEXT: v_readlane_b32 s39, v44, 5 +; GFX9-NEXT: v_readlane_b32 s38, v44, 4 +; GFX9-NEXT: v_readlane_b32 s37, v44, 3 +; GFX9-NEXT: v_readlane_b32 s36, v44, 2 +; GFX9-NEXT: v_readlane_b32 s35, v44, 1 +; GFX9-NEXT: v_readlane_b32 s34, v44, 0 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x800 +; GFX9-NEXT: v_readlane_b32 s33, v44, 15 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] %b = and i32 %b.arg, 16777215 %s = and i32 %s.arg, 16777215 diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll @@ -27,23 +27,23 @@ ; CHECK-LABEL: csr_vgpr_spill_fp_callee: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s8, s33 +; CHECK-NEXT: s_mov_b32 s15, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_u32 s32, s32, 0x400 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12 +; CHECK-NEXT: s_getpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, callee_has_fp@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s19, s19, callee_has_fp@rel32@hi+12 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 s[6:7], s[30:31] -; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] wait
CHECK-NEXT: s_mov_b32 s33, s15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[6:7] +; CHECK-NEXT: s_setpc_b64 s[16:17] bb: call fastcc void @callee_has_fp() call void asm sideeffect "; clobber csr v40", "~{v40}"() @@ -53,15 +53,15 @@ define amdgpu_kernel void @kernel_call() { ; CHECK-LABEL: kernel_call: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s7 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, csr_vgpr_spill_fp_callee@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, csr_vgpr_spill_fp_callee@rel32@hi+12 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 +; CHECK-DAG: s_addc_u32 s1, s1, 0 +; CHECK-DAG: s_getpc_b64 s[18:19] +; CHECK-DAG: s_add_u32 s18, s18, csr_vgpr_spill_fp_callee@rel32@lo+4 +; CHECK-DAG: s_addc_u32 s19, s19, csr_vgpr_spill_fp_callee@rel32@hi+12 +; CHECK-DAG: s_mov_b32 s32, 0 +; CHECK-DAG: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_endpgm bb: tail call fastcc void @csr_vgpr_spill_fp_callee() @@ -73,23 +73,23 @@ ; CHECK-LABEL: csr_vgpr_spill_fp_tailcall_callee: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b64 exec, s[16:17] ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; clobber csr v40 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: v_writelane_b32 v1, s33, 0 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, callee_has_fp@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, callee_has_fp@rel32@hi+12 ; CHECK-NEXT: v_readlane_b32 s33, v1, 0 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_setpc_b64 s[4:5] +; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: s_setpc_b64 s[16:17] bb: call void asm sideeffect "; clobber csr v40", "~{v40}"() tail call fastcc void @callee_has_fp() @@ -99,15 +99,15 @@ define amdgpu_kernel void @kernel_tailcall() { ; CHECK-LABEL: kernel_tailcall: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s7 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, csr_vgpr_spill_fp_tailcall_callee@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, csr_vgpr_spill_fp_tailcall_callee@rel32@hi+12 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 +; CHECK-DAG: s_addc_u32 s1, s1, 0 +; CHECK-DAG: s_getpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, csr_vgpr_spill_fp_tailcall_callee@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s19, s19, 
csr_vgpr_spill_fp_tailcall_callee@rel32@hi+12 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_endpgm bb: tail call fastcc void @csr_vgpr_spill_fp_tailcall_callee() diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -238,7 +238,7 @@ ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v5 +; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31 ; MUBUF-NEXT: s_mov_b32 s32, s6 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 @@ -275,7 +275,7 @@ ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2 ; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s4 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off -; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v5 +; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31 ; FLATSCR-NEXT: s_mov_b32 s32, s4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 @@ -331,13 +331,13 @@ ; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 ; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0 -; MUBUF-NEXT: v_mov_b32_e32 v5, s6 -; MUBUF-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; MUBUF-NEXT: v_mov_b32_e32 v4, s6 +; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; MUBUF-NEXT: v_mov_b32_e32 v2, 1 -; MUBUF-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v4 +; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31 ; MUBUF-NEXT: s_mov_b32 s32, s6 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 @@ -364,12 +364,12 @@ ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 ; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000 -; FLATSCR-NEXT: v_mov_b32_e32 v5, 0 -; FLATSCR-NEXT: v_mov_b32_e32 v6, 1 +; FLATSCR-NEXT: v_mov_b32_e32 v4, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v5, 1 ; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[5:6], s2 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s2 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off -; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v4 +; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31 ; FLATSCR-NEXT: s_mov_b32 s32, s2 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll --- a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll @@ -10,7 +10,7 @@ ; GCN: v_writelane_b32 v255, s33, 2 ; GCN: v_writelane_b32 v255, s30, 0 ; GCN: v_writelane_b32 v255, s31, 1 -; GCN: s_swappc_b64 s[30:31], s[4:5] +; GCN: s_swappc_b64 s[30:31], s[16:17] ; GCN: v_readlane_b32 s30, v255, 0 ; GCN: v_readlane_b32 s31, v255, 1 ; GCN: v_readlane_b32 s33, v255, 2 @@ -56,7 +56,7 @@ ; GCN: v_writelane_b32 v254, s33, 2 ; GCN: v_writelane_b32 v254, s30, 0 ; GCN: v_writelane_b32 v254, s31, 1 -; GCN: s_swappc_b64 s[30:31], s[4:5] +; GCN: s_swappc_b64 s[30:31], s[16:17] ; GCN: v_readlane_b32 s30, v254, 0 ; GCN: v_readlane_b32 s31, v254, 1 ; GCN: v_readlane_b32 s33, v254, 2 @@ 
-150,7 +150,7 @@ ; GCN-LABEL: {{^}}reserve_vgpr_with_tail_call ; GCN-NOT: buffer_store_dword v255, off, s[0:3], s32 ; GCN-NOT: v_writelane -; GCN: s_setpc_b64 s[4:5] +; GCN: s_setpc_b64 s[16:17] define void @reserve_vgpr_with_tail_call() #0 { %alloca = alloca i32, align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -128,12 +128,12 @@ ; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}} +; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:8 ; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4 ; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 -; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]] ; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]] +; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]] ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 @@ -155,12 +155,9 @@ ; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}} ; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4 -; GCN-NOT: s32 - ; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}} ; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4 -; GCN-NOT: s32 ; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: @@ -170,7 +167,7 @@ ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:28 +; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:32 ; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: @@ -197,15 +194,14 @@ ; Have another non-tail in the function ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call: ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec ; GCN: s_mov_b32 s33, s32 -; GCN-DAG: s_add_u32 s32, s32, 0x400 +; GCN-DAG: s_add_u32 s32, s32, 0x800 -; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-DAG: v_writelane_b32 v42, s34, 0 -; GCN-DAG: v_writelane_b32 v42, s35, 1 +; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-DAG: v_writelane_b32 v43, s46, 12 ; GCN-DAG: s_getpc_b64 s[4:5] ; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 @@ -214,22 +210,22 @@ ; GCN: s_swappc_b64 -; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 +; GCN: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, 
sibling_call_i32_fastcc_i32_i32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 -; GCN-DAG: v_readlane_b32 s34, v42, 0 -; GCN-DAG: v_readlane_b32 s35, v42, 1 +; GCN-DAG: v_readlane_b32 s35, v43, 1 +; GCN-DAG: v_readlane_b32 s34, v43, 0 -; GCN: s_sub_u32 s32, s32, 0x400 +; GCN: s_sub_u32 s32, s32, 0x800 ; GCN-NEXT: v_readlane_b32 s33, -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: s_setpc_b64 s[16:17] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { entry: %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) @@ -247,7 +243,7 @@ ; GCN-NOT: s33 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset: -; GCN: s_setpc_b64 s[4:5] +; GCN: s_setpc_b64 s[16:17] define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: %alloca = alloca [16 x i32], align 4, addrspace(5) @@ -259,10 +255,10 @@ ; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: ; GCN-NOT: s33 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:44 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:48 ; GCN-NOT: s33 -; GCN: s_setpc_b64 s[4:5] +; GCN: s_setpc_b64 s[16:17] define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 { entry: %alloca = alloca [16 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -1,16 +1,16 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=1 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=7 < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}spill_csr_s5_copy: ; GCN: s_or_saveexec_b64 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 v40, s33, 2 +; GCN: v_writelane_b32 v40, s33, 5 ; GCN: s_swappc_b64 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 ; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; GCN: v_readlane_b32 s33, v40, 2 +; GCN: v_readlane_b32 s33, v40, 5 ; GCN: s_or_saveexec_b64 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN: s_mov_b64 exec diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -157,20 +157,21 @@ ; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill: ; GCN: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2 -; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3 ; GCN-DAG: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0 ; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 -; GCN: s_mov_b32 s34, s32 ; GCN: v_mov_b32_e32 v32, 0 +; GCN-DAG: 
v_writelane_b32 [[VGPR_REG]], s34, 3 +; GCN: s_mov_b32 s34, s32 ; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 -; GCN-NEXT: s_add_u32 s32, s32, 0x30000 - +; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 offset:4 +; GCN-DAG: s_add_u32 s32, s32, 0x30000 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN: s_sub_u32 s32, s32, 0x30000 ; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG]], 2 diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -5,61 +5,61 @@ define hidden void @widget() { ; GCN-LABEL: widget: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: flat_load_dword v0, v[0:1] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 21, v0 -; GCN-NEXT: s_and_b64 vcc, exec, vcc -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: s_cbranch_vccz BB0_3 -; GCN-NEXT: ; %bb.1: ; %bb4 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0 -; GCN-NEXT: s_and_b64 vcc, exec, vcc -; GCN-NEXT: s_cbranch_vccnz BB0_4 -; GCN-NEXT: ; %bb.2: ; %bb7 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, wibble@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, wibble@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: s_branch BB0_7 -; GCN-NEXT: BB0_3: ; %bb2 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 21, v0 -; GCN-NEXT: s_and_b64 vcc, exec, vcc -; GCN-NEXT: s_cbranch_vccnz BB0_6 -; GCN-NEXT: BB0_4: ; %bb9 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, wibble@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, wibble@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execnz BB0_7 -; GCN-NEXT: ; %bb.5: ; %bb9.bb12_crit_edge -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: BB0_6: ; %bb12 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: flat_store_dword v[0:1], v2 -; GCN-NEXT: BB0_7: ; %UnifiedReturnBlock -; GCN-NEXT: v_readlane_b32 s4, v40, 0 -; GCN-NEXT: v_readlane_b32 s5, v40, 1 -; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_add_u32 s32, s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; 
GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: flat_load_dword v0, v[0:1] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 21, v0 +; GCN-NEXT: s_and_b64 vcc, exec, vcc +; GCN-NEXT: s_cbranch_vccz BB0_3 +; GCN-NEXT: ; %bb.1: ; %bb4 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0 +; GCN-NEXT: s_and_b64 vcc, exec, vcc +; GCN-NEXT: s_cbranch_vccnz BB0_4 +; GCN-NEXT: ; %bb.2: ; %bb7 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_branch BB0_7 +; GCN-NEXT: BB0_3: ; %bb2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 21, v0 +; GCN-NEXT: s_and_b64 vcc, exec, vcc +; GCN-NEXT: s_cbranch_vccnz BB0_6 +; GCN-NEXT: BB0_4: ; %bb9 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execnz BB0_7 +; GCN-NEXT: ; %bb.5: ; %bb9.bb12_crit_edge +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: BB0_6: ; %bb12 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: BB0_7: ; %UnifiedReturnBlock +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 +; GCN-NEXT: s_sub_u32 s32, s32, 0x400 +; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] ; SI-OPT-LABEL: @widget( ; SI-OPT-NEXT: bb: ; SI-OPT-NEXT: [[TMP:%.*]] = load i32, i32 addrspace(1)* null, align 16 @@ -186,95 +186,124 @@ ; GCN-LABEL: blam: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v43, s33, 4 +; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: v_writelane_b32 v44, s33, 15 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s37, 3 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v44, s34, 0 +; GCN-NEXT: v_writelane_b32 v44, s35, 1 +; GCN-NEXT: v_writelane_b32 v44, s36, 2 +; GCN-NEXT: v_writelane_b32 v44, s38, 3 +; GCN-NEXT: v_writelane_b32 v44, s39, 4 +; GCN-NEXT: v_writelane_b32 v44, s40, 5 +; GCN-NEXT: v_writelane_b32 v44, 
s41, 6 +; GCN-NEXT: v_writelane_b32 v44, s42, 7 +; GCN-NEXT: v_writelane_b32 v44, s43, 8 +; GCN-NEXT: v_writelane_b32 v44, s44, 9 +; GCN-NEXT: v_writelane_b32 v44, s45, 10 +; GCN-NEXT: v_writelane_b32 v44, s46, 11 +; GCN-NEXT: v_writelane_b32 v44, s47, 12 +; GCN-NEXT: v_writelane_b32 v44, s48, 13 +; GCN-NEXT: v_writelane_b32 v44, s49, 14 +; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: s_mov_b32 s34, s14 +; GCN-NEXT: s_mov_b32 s35, s13 +; GCN-NEXT: s_mov_b32 s36, s12 +; GCN-NEXT: s_mov_b64 s[38:39], s[10:11] +; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] +; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] +; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] ; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: flat_load_dword v40, v[1:2] -; GCN-NEXT: v_mov_b32_e32 v42, 0 -; GCN-NEXT: s_getpc_b64 s[36:37] -; GCN-NEXT: s_add_u32 s36, s36, spam@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s37, s37, spam@rel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v40 +; GCN-NEXT: flat_load_dword v41, v[0:1] +; GCN-NEXT: v_mov_b32_e32 v43, 0 +; GCN-NEXT: s_getpc_b64 s[48:49] +; GCN-NEXT: s_add_u32 s48, s48, spam@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s49, s49, spam@rel32@hi+12 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[34:35], 0, v40 +; GCN-NEXT: v_cmp_eq_f32_e64 s[46:47], 0, v41 ; GCN-NEXT: s_branch BB1_3 -; GCN-NEXT: BB1_1: ; %bb10 -; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 +; GCN-NEXT: BB1_1: ; %bb10 +; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: BB1_2: ; %bb18 -; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 +; GCN-NEXT: BB1_2: ; %bb18 +; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000 ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: BB1_3: ; %bb2 -; GCN-NEXT: ; =>This Loop Header: Depth=1 -; GCN-NEXT: ; Child Loop BB1_4 Depth 2 +; GCN-NEXT: BB1_3: ; %bb2 +; GCN-NEXT: ; =>This Loop Header: Depth=1 +; GCN-NEXT: ; Child Loop BB1_4 Depth 2 ; GCN-NEXT: s_mov_b64 s[6:7], 0 -; GCN-NEXT: BB1_4: ; %bb2 -; GCN-NEXT: ; Parent Loop BB1_3 Depth=1 -; GCN-NEXT: ; => This Inner Loop Header: Depth=2 -; GCN-NEXT: flat_load_dword v0, v[41:42] +; GCN-NEXT: BB1_4: ; %bb2 +; GCN-NEXT: ; Parent Loop BB1_3 Depth=1 +; GCN-NEXT: ; => This Inner Loop Header: Depth=2 +; GCN-NEXT: flat_load_dword v0, v[42:43] ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 3, v0 ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GCN-NEXT: s_cbranch_execz BB1_6 -; GCN-NEXT: ; %bb.5: ; %bb8 -; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2 +; GCN-NEXT: %bb.5: ; %bb8 +; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN-NEXT: s_cbranch_execnz BB1_4 ; GCN-NEXT: s_branch BB1_1 -; GCN-NEXT: BB1_6: ; %bb6 -; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2 +; GCN-NEXT: BB1_6: ; %bb6 +; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_andn2_b64 exec, 
exec, s[4:5] ; GCN-NEXT: s_cbranch_execnz BB1_4 -; GCN-NEXT: ; %bb.7: ; %bb11 -; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 -; GCN-NEXT: s_mov_b64 s[4:5], 0 -; GCN-NEXT: s_mov_b64 s[6:7], 0 -; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-NEXT: s_cbranch_execnz BB1_4 -; GCN-NEXT: ; %bb.8: ; %bb14 -; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[34:35] -; GCN-NEXT: s_cbranch_execnz BB1_10 -; GCN-NEXT: ; %bb.9: ; %bb16 -; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: BB1_10: ; %bb17 -; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], 0 -; GCN-NEXT: s_branch BB1_2 +; GCN-NEXT: %bb.7: ; %bb11 +; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2 +; GCN-NEXT: _or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] +; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] +; GCN-NEXT: s_mov_b64 s[8:9], s[40:41] +; GCN-NEXT: s_mov_b64 s[10:11], s[38:39] +; GCN-NEXT: s_mov_b32 s12, s36 +; GCN-NEXT: s_mov_b32 s13, s35 +; GCN-NEXT: s_mov_b32 s14, s34 +; GCN-NEXT: v_mov_b32_e32 v31, v40 +; GCN-NEXT: s_swappc_b64 s[30:31], s[48:49] +; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN-NEXT: s_cbranch_execnz BB1_4 +; GCN-NEXT: ; %bb.8: ; %bb14 +; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 +; GCN-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[46:47] +; GCN-NEXT: s_cbranch_execnz BB1_10 +; GCN-NEXT: ; %bb.9: ; %bb16 +; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: BB1_10: ; %bb17 +; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], 0 +; GCN-NEXT: s_branch BB1_2 + bb: %tmp = load float, float* null, align 16 br label %bb2 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -21,14 +21,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v32, v12 ; GFX9: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND -; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[4:7] dmask:0x1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[16:23], s[4:7] dmask:0x1 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX9-NEXT: v_writelane_b32 v44, s30, 0 ; GFX9: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload @@ -53,14 +53,14 @@ ; GFX10: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], 
s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[16:23], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX10: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_getpc_b64 s[16:17] +; GFX10-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 +; GFX10: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX10: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX10: buffer_load_dword v43, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 @@ -100,14 +100,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v40, v12 ; GFX9: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[4:7] dmask:0x1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[4:7] dmask:0x1 ; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload @@ -127,22 +127,29 @@ ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill - -; GFX10: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 +; GFX10: s_getpc_b64 s[16:17] +; GFX10-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 +; GFX10-NEXT: s_mov_b32 s37, s36 +; GFX10-NEXT: s_mov_b32 s38, s36 +; GFX10-NEXT: s_mov_b32 s39, s36 +; GFX10-NEXT: s_mov_b32 s40, s36 +; GFX10-NEXT: s_mov_b32 s41, s36 +; GFX10-NEXT: s_mov_b32 s42, s36 +; GFX10-NEXT: s_mov_b32 s43, s36 +; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_writelane_b32 v45, s30, 8 ; GFX10-NEXT: v_mov_b32_e32 v40, v16 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v41, v15 ; GFX10-NEXT: v_mov_b32_e32 v42, v14 ; GFX10-NEXT: v_mov_b32_e32 v43, v13 +; GFX10-NEXT: v_writelane_b32 v45, s31, 9 ; GFX10-NEXT: v_mov_b32_e32 v44, v12 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10: buffer_load_dword v44, off, s[0:3], s33 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- 
a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1062,7 +1062,7 @@ ; GCN-NEXT: s_waitcnt_vscnt ; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]]], -1{{$}} +; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]+]], -1{{$}} ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir @@ -20,10 +20,18 @@ # FULL-NEXT: stackPtrOffsetReg: '$sgpr13' # FULL-NEXT: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } # FULL-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -# FULL-NEXT: workGroupIDX: { reg: '$sgpr6' } +# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +# FULL-NEXT: workGroupIDX: { reg: '$sgpr6' } +# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' } +# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' } # FULL-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' } -# FULL-NEXT: workItemIDX: { reg: '$vgpr0' } +# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +# FULL-NEXT: workItemIDX: { reg: '$vgpr0' } +# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } +# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -47,10 +55,18 @@ # SIMPLE-NEXT: stackPtrOffsetReg: '$sgpr13' # SIMPLE-NEXT: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } # SIMPLE-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr6' } +# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr6' } +# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' } +# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' } # SIMPLE-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' } -# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr0' } +# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr0' } +# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } +# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # SIMPLE-NEXT: occupancy: 10 # SIMPLE-NEXT: body: name: kernel0 @@ -96,6 +112,16 @@ # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' # FULL-NEXT: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' } +# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' } +# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' } +# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } +# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } +# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -111,6 +137,16 @@ # SIMPLE-NEXT: maxKernArgAlign: 1 # SIMPLE-NEXT: argumentInfo: # 
SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' } +# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' } +# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' } +# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } +# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } +# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # SIMPLE-NEXT: occupancy: 10 # SIMPLE-NEXT: body: @@ -139,6 +175,16 @@ # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' # FULL-NEXT: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' } +# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' } +# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' } +# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } +# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } +# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -154,6 +200,16 @@ # SIMPLE-NEXT: maxKernArgAlign: 1 # SIMPLE-NEXT: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' } +# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' } +# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' } +# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } +# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } +# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # SIMPLE-NEXT: occupancy: 10 # SIMPLE-NEXT: body: @@ -183,6 +239,16 @@ # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' # FULL-NEXT: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' } +# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' } +# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' } +# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } +# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } +# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -199,6 +265,16 @@ # SIMPLE-NEXT: isEntryFunction: true # SIMPLE-NEXT: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' } +# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' } +# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' } +# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } +# SIMPLE-NEXT: 
workItemIDY: { reg: '$vgpr31', mask: 1047552 } +# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # SIMPLE-NEXT: occupancy: 10 # SIMPLE-NEXT: body: @@ -235,13 +311,31 @@ # FULL: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } # FULL-NEXT: flatScratchInit: { offset: 4 } -# FULL-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 } +# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' } +# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' } +# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' } +# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } +# FULL-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 } +# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # SIMPLE: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } # SIMPLE-NEXT: flatScratchInit: { offset: 4 } -# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 } +# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' } +# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' } +# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' } +# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } +# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 } +# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } name: fake_stack_arginfo machineFunctionInfo: argumentInfo: