diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -25,6 +25,13 @@ using namespace llvm; namespace { +static constexpr StringLiteral ImplicitAttrNames[] = { + // X ids unnecessarily propagated to kernels. + "amdgpu-work-item-id-x", "amdgpu-work-item-id-y", + "amdgpu-work-item-id-z", "amdgpu-work-group-id-x", + "amdgpu-work-group-id-y", "amdgpu-work-group-id-z", + "amdgpu-dispatch-ptr", "amdgpu-dispatch-id", + "amdgpu-implicitarg-ptr"}; class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass { private: @@ -194,18 +201,10 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee, bool &NeedQueuePtr) { - // X ids unnecessarily propagated to kernels. - static constexpr StringLiteral AttrNames[] = { - "amdgpu-work-item-id-x", "amdgpu-work-item-id-y", - "amdgpu-work-item-id-z", "amdgpu-work-group-id-x", - "amdgpu-work-group-id-y", "amdgpu-work-group-id-z", - "amdgpu-dispatch-ptr", "amdgpu-dispatch-id", - "amdgpu-implicitarg-ptr"}; - if (handleAttr(Parent, Callee, "amdgpu-queue-ptr")) NeedQueuePtr = true; - for (StringRef AttrName : AttrNames) + for (StringRef AttrName : ImplicitAttrNames) handleAttr(Parent, Callee, AttrName); } @@ -268,7 +267,20 @@ bool Changed = false; bool NeedQueuePtr = false; bool HaveCall = false; + bool HasIndirectCall = false; bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv()); + CallingConv::ID CC = F.getCallingConv(); + bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx); + + // If this function hasAddressTaken() = true + // then add all attributes corresponding to the implicit args. 
+ if (CallingConvSupportsAllImplicits && + F.hasAddressTaken(nullptr, true, true, true)) { + for (StringRef AttrName : ImplicitAttrNames) { + F.addFnAttr(AttrName); + } + Changed = true; + } for (BasicBlock &BB : F) { for (Instruction &I : BB) { @@ -281,10 +293,12 @@ const Function *Callee = dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts()); - // TODO: Do something with indirect calls. + // Note the occurrence of indirect call. if (!Callee) { - if (!CB->isInlineAsm()) + if (!CB->isInlineAsm()) { + HasIndirectCall = true; HaveCall = true; + } continue; } @@ -351,6 +365,28 @@ Changed = true; } + // This pass cannot copy attributes from callees to callers + // if there is an indirect call and thus in such cases, + // hasAddressTaken() would be false for kernels and functions + // making an indirect call (if they are themselves not indirectly called). + // We must tag all such kernels/functions with all implicit attributes + // for correctness. + // e.g. + // 1. Kernel K1 makes an indirect call to function F1. + // Without detecting an indirect call in K1, this pass will not + // add all implicit args to K1 (which is incorrect). + // 2. Kernel K1 makes direct call to F1 which makes indirect call to function + // F2. + // Without detecting an indirect call in F1 (whose hasAddressTaken() is + // false), the pass will not add all implicit args to F1 (which is + // essential for correctness). 
+ if (CallingConvSupportsAllImplicits && HasIndirectCall) { + for (StringRef AttrName : ImplicitAttrNames) { + F.addFnAttr(AttrName); + } + Changed = true; + } + return Changed; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -388,10 +388,6 @@ else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32")) MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32)); } - // Set -fixed-function-abi to true if not provided.. - if (TT.getOS() == Triple::AMDHSA && - EnableAMDGPUFixedFunctionABIOpt.getNumOccurrences() == 0) - EnableFixedFunctionABI = true; } bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll @@ -53,9 +53,9 @@ ; CHECK: bb.1 (%ir-block.0): ; CHECK: liveins: $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %8, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %9, !0 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 - ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %9 + ; CHECK: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %2, !0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %2 ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY2]] ; CHECK: $vgpr0 = COPY [[ADD]](s32) ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] @@ -87,8 +87,8 @@ ; CHECK: bb.1.entry: ; 
CHECK: liveins: $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 ; CHECK: $vgpr0 = COPY [[COPY1]](s32) ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 @@ -102,8 +102,8 @@ ; CHECK: bb.1.entry: ; CHECK: liveins: $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %8 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 ; CHECK: $vgpr0 = COPY [[COPY1]](s32) ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 @@ -118,9 +118,9 @@ ; CHECK: bb.1 (%ir-block.0): ; CHECK: liveins: $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 1835018 /* regdef:VGPR_32 */, def %9 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 - ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %9 + ; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 1835018 /* regdef:VGPR_32 */, def %2 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %2 ; CHECK: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY1]], [[COPY2]] ; CHECK: $vgpr0 = COPY [[FADD]](s32) ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] @@ -138,9 +138,9 @@ ; CHECK: bb.1 (%ir-block.0): ; CHECK: liveins: $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY 
$sgpr30_sgpr31 - ; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 2883594 /* regdef:VReg_64 */, def %9 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 - ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY %9 + ; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 2883594 /* regdef:VReg_64 */, def %2 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY %2 ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64) ; CHECK: $vgpr0 = COPY [[UV]](s32) ; CHECK: $vgpr1 = COPY [[UV1]](s32) @@ -209,8 +209,8 @@ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) - ; CHECK: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 1835017 /* reguse:VGPR_32 */, [[COPY2]] - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %9 + ; CHECK: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %2, 1835017 /* reguse:VGPR_32 */, [[COPY2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %2 ; CHECK: $vgpr0 = COPY [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] ; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 @@ -225,8 +225,8 @@ ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 196622 /* mem:m */, [[COPY]](p3) - ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %9 + ; CHECK: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %2, 196622 /* mem:m */, [[COPY]](p3) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %2 ; CHECK: $vgpr0 = COPY [[COPY2]](s32) ; CHECK: 
[[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] ; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 @@ -243,8 +243,8 @@ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[AND]](s32) - ; CHECK: INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3) - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %11 + ; CHECK: INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %4 ; CHECK: $vgpr0 = COPY [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] ; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 @@ -258,14 +258,14 @@ ; CHECK: bb.1.entry: ; CHECK: liveins: $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %8 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 - ; CHECK: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %10 - ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %10 + ; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 + ; CHECK: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %3 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %3 ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY1]](s32) ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY2]](s32) - ; CHECK: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %12, 1966089 /* reguse:SReg_32 */, [[COPY3]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3) - ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY %12 + ; CHECK: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1966090 /* regdef:SReg_32 
*/, def %5, 1966089 /* reguse:SReg_32 */, [[COPY3]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY %5 ; CHECK: $vgpr0 = COPY [[COPY5]](s32) ; CHECK: [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] ; CHECK: S_SETPC_B64_return [[COPY6]], implicit $vgpr0 @@ -288,10 +288,10 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]](s32) ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32) - ; CHECK: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 1835018 /* regdef:VGPR_32 */, def %12, 1835018 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY5]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY6]](tied-def 5) - ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY %11 - ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY %12 - ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY %13 + ; CHECK: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %4, 1835018 /* regdef:VGPR_32 */, def %5, 1835018 /* regdef:VGPR_32 */, def %6, 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY5]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY6]](tied-def 5) + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY %4 + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY %5 + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY %6 ; CHECK: G_STORE [[COPY7]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; CHECK: G_STORE [[COPY8]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; CHECK: G_STORE [[COPY9]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) @@ -312,11 +312,11 @@ ; CHECK: bb.1.entry: ; CHECK: liveins: $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, 
def %8 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32) - ; CHECK: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3) - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %10 + ; CHECK: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %3, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %3 ; CHECK: $vgpr0 = COPY [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] ; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll @@ -14,7 +14,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX8-NEXT: s_trap 2 ; GFX8-NEXT: ds_write_b32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -37,10 +37,10 @@ ; GFX8-LABEL: func_use_lds_global_constexpr_cast: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX8-NEXT: s_trap 2 -; GFX8-NEXT: flat_store_dword v[0:1], v0 -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dword v[0:1], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: func_use_lds_global_constexpr_cast: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll @@ -101,7 +101,7 @@ ; ALL-LABEL: {{^}}test_workitem_id_x_func: ; ALL: s_waitcnt -; HSA-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; HSA-NEXT: v_and_b32_e32 v2, 0x3ff, v2 ; MESA-NEXT: v_and_b32_e32 v2, 0x3ff, v2 define void @test_workitem_id_x_func(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -110,7 +110,7 @@ } ; ALL-LABEL: {{^}}test_workitem_id_y_func: -; HSA: v_lshrrev_b32_e32 v2, 10, v31 +; HSA: v_lshrrev_b32_e32 v2, 10, v2 ; MESA: v_lshrrev_b32_e32 v2, 10, v2 define void @test_workitem_id_y_func(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.y() @@ -119,7 +119,7 @@ } ; ALL-LABEL: {{^}}test_workitem_id_z_func: -; HSA: v_lshrrev_b32_e32 v2, 20, v31 +; HSA: v_lshrrev_b32_e32 v2, 20, v2 ; MESA: v_lshrrev_b32_e32 v2, 20, v2 define void @test_workitem_id_z_func(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.z() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -172,7 +172,7 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v4 ; GCN-NEXT: v_add_u32_e32 v2, s6, v2 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v31 +; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v5 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 ; GCN-NEXT: global_store_dword v[0:1], v2, off @@ -227,14 +227,14 @@ ; GCN-NEXT: s_add_u32 s6, s32, 0x1000 ; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v4, s6 -; GCN-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 
offen ; GCN-NEXT: v_mov_b32_e32 v2, 1 -; GCN-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4 +; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v3 ; GCN-NEXT: v_add_u32_e32 v2, s6, v2 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v31 +; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 ; GCN-NEXT: global_store_dword v[0:1], v2, off diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -42,7 +42,7 @@ ; Test handling inside a non-kernel ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func: -; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x10{{$}} +; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}} ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] ; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0 ; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll --- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll @@ -122,18 +122,18 @@ ; GCN-LABEL: {{^}}kernel_call_func_32_agprs: ; GFX908: .amdhsa_next_free_vgpr 32 -; GFX90A: .amdhsa_accum_offset 32 -; GCN: NumVgprs: 32 +; GFX90A: .amdhsa_accum_offset 12 +; GCN: NumVgprs: 9 ; GCN: NumAgprs: 32 ; GFX908: TotalNumVgprs: 32 -; GFX90A: TotalNumVgprs: 64 +; GFX90A: TotalNumVgprs: 44 ; GFX908: VGPRBlocks: 7 -; GFX90A: VGPRBlocks: 7 +; GFX90A: VGPRBlocks: 5 ; GFX908: NumVGPRsForWavesPerEU: 32 -; GFX90A: NumVGPRsForWavesPerEU: 64 -; GFX90A: AccumOffset: 32 +; GFX90A: NumVGPRsForWavesPerEU: 44 +; GFX90A: AccumOffset: 12 ; GCN: Occupancy: 8 -; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 7 +; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 2 
define amdgpu_kernel void @kernel_call_func_32_agprs() #0 { bb: call void @func_32_agprs() #0 @@ -141,10 +141,10 @@ } ; GCN-LABEL: {{^}}func_call_func_32_agprs: -; GCN: NumVgprs: 32 +; GCN: NumVgprs: 9 ; GCN: NumAgprs: 32 ; GFX908: TotalNumVgprs: 32 -; GFX90A: TotalNumVgprs: 64 +; GFX90A: TotalNumVgprs: 44 define void @func_call_func_32_agprs() #0 { bb: call void @func_32_agprs() #0 @@ -154,21 +154,21 @@ declare void @undef_func() ; GCN-LABEL: {{^}}kernel_call_undef_func: -; GFX908: .amdhsa_next_free_vgpr 32 -; GFX90A: .amdhsa_next_free_vgpr 56 -; GFX90A: .amdhsa_accum_offset 32 -; GCN: NumVgprs: 32 +; GFX908: .amdhsa_next_free_vgpr 24 +; GFX90A: .amdhsa_next_free_vgpr 48 +; GFX90A: .amdhsa_accum_offset 24 +; GCN: NumVgprs: 24 ; GCN: NumAgprs: 24 -; GFX908: TotalNumVgprs: 32 -; GFX90A: TotalNumVgprs: 56 -; GFX908: VGPRBlocks: 7 -; GFX90A: VGPRBlocks: 6 -; GFX908: NumVGPRsForWavesPerEU: 32 -; GFX90A: NumVGPRsForWavesPerEU: 56 -; GFX90A: AccumOffset: 32 -; GFX908: Occupancy: 8 +; GFX908: TotalNumVgprs: 24 +; GFX90A: TotalNumVgprs: 48 +; GFX908: VGPRBlocks: 5 +; GFX90A: VGPRBlocks: 5 +; GFX908: NumVGPRsForWavesPerEU: 24 +; GFX90A: NumVGPRsForWavesPerEU: 48 +; GFX90A: AccumOffset: 24 +; GFX908: Occupancy: 10 ; GFX90A: Occupancy: 8 -; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 7 +; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 5 define amdgpu_kernel void @kernel_call_undef_func() #0 { bb: call void @undef_func() diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -334,4 +334,4 @@ ; HSA: attributes #17 = { nounwind "uniform-work-group-size"="false" } ; HSA: attributes #18 = { nounwind } ; HSA: attributes #19 = { nounwind "amdgpu-calls" "uniform-work-group-size"="false" } -; HSA: attributes #20 = { nounwind "amdgpu-dispatch-ptr" 
"target-cpu"="fiji" } +; HSA: attributes #20 = { nounwind "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "target-cpu"="fiji" } diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -80,15 +80,14 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_i1_signext: ; HSA: buffer_load_ubyte [[VAR:v[0-9]+]] -; HSA-DAG: s_mov_b32 s32, 0 +; HSA: s_mov_b32 s32, 0 ; MESA-DAG: buffer_load_ubyte [[VAR:v[0-9]+]] ; MESA-DAG: s_mov_b32 s32, 0{{$}} ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+12 -; MESA-DAG: v_bfe_i32 v0, v0, 0, 1 -; HSA: v_bfe_i32 v0, v3, 0, 1 +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { @@ -100,24 +99,18 @@ ; FIXME: load should be scheduled before getpc ; GCN-LABEL: {{^}}test_call_external_void_func_i1_zeroext: -; HSA: buffer_load_ubyte v3 +; HSA: buffer_load_ubyte v0 ; HSA-DAG: s_mov_b32 s32, 0{{$}} ; MESA: buffer_load_ubyte v0 ; MESA-DAG: s_mov_b32 s32, 0{{$}} -; MESA: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} -; MESA-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4 -; MESA-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12 -; MESA-NEXT: v_and_b32_e32 v0, 1, v0 -; MESA-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} -; MESA-NEXT: s_endpgm -; HSA: s_getpc_b64 
s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} -; HSA-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12 -; HSA-NEXT: v_and_b32_e32 v0, 1, v3 -; HSA-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} -; HSA-NEXT: s_endpgm +; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { %var = load volatile i1, i1 addrspace(1)* undef call void @external_void_func_i1_zeroext(i1 %var) @@ -143,8 +136,7 @@ ; FIXME: don't wait before call ; GCN-LABEL: {{^}}test_call_external_void_func_i8_signext: -; MESA-DAG: buffer_load_sbyte v0 -; HSA-DAG: buffer_load_sbyte v3 +; GCN-DAG: buffer_load_sbyte v0 ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+12 @@ -152,7 +144,7 @@ ; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt -; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { %var = load volatile i8, i8 addrspace(1)* undef @@ -162,8 +154,7 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_i8_zeroext: -; MESA-DAG: buffer_load_ubyte v0 -; HSA-DAG: buffer_load_ubyte v3 +; GCN-DAG: buffer_load_ubyte v0 ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_add_u32 s[[PC_LO]], 
s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+12 @@ -171,7 +162,7 @@ ; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt -; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { %var = load volatile i8, i8 addrspace(1)* undef @@ -192,8 +183,7 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_i16_signext: -; MESA-DAG: buffer_load_sshort v0 -; HSA-DAG: buffer_load_sshort v3 +; GCN-DAG: buffer_load_sshort v0 ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+12 @@ -201,7 +191,7 @@ ; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt -; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { %var = load volatile i16, i16 addrspace(1)* undef @@ -218,7 +208,7 @@ ; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt -; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { %var = load volatile i16, i16 addrspace(1)* undef @@ -491,7 +481,7 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm: {{.*}} -; GCN-NOT: v3, +; GCN-NOT: v3 ; GCN-DAG: v_mov_b32_e32 v0, 3 ; GCN-DAG: v_mov_b32_e32 v1, 4 ; GCN-DAG: v_mov_b32_e32 v2, 5 @@ -596,7 +586,7 @@ ; GCN-DAG: buffer_load_dwordx4 v[20:23], off ; GCN-DAG: buffer_load_dwordx4 v[24:27], off ; GCN-DAG: buffer_load_dwordx4 v[28:31], off -; 
MESA-NOT: s_waitcnt +; GCN-NOT: s_waitcnt ; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef @@ -621,8 +611,7 @@ ; GCN-DAG: buffer_load_dwordx4 v[28:31], off ; GCN: s_waitcnt -; MESA: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32{{$}} -; HSA: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4 +; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32{{$}} ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { @@ -645,11 +634,9 @@ } ; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32: -; MESA: buffer_load_ubyte v0, off -; MESA-DAG: buffer_load_dword v1, off -; HSA: buffer_load_ubyte v3, off -; HSA-DAG: buffer_load_dword v4, off -; MESA-NOT: s_waitcnt +; GCN: buffer_load_ubyte v0, off +; GCN: buffer_load_dword v1, off +; GCN-NOT: s_waitcnt ; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef @@ -751,19 +738,15 @@ } ; GCN-LABEL: {{^}}tail_call_byval_align16: -; GCN-NOT: s32, -; MESA: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32 offset:8 -; MESA: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:12 -; HSA: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32 -; HSA: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:24 +; GCN-NOT: s32 +; GCN: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32 offset:8 +; GCN: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:12 ; GCN: s_getpc_b64 -; MESA: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:4 -; MESA: buffer_store_dword [[VREG1]], off, s[0:3], s32{{$}} -; HSA: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:16 -; HSA: buffer_store_dword [[VREG1]], off, s[0:3], s32 -; GCN-NOT: s32, +; GCN: 
buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:4 +; GCN: buffer_store_dword [[VREG1]], off, s[0:3], s32{{$}} +; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { entry: @@ -774,16 +757,11 @@ ; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64: ; GCN-NOT: s32 -; MESA: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; MESA: buffer_load_dword v33, off, s[0:3], s32{{$}} -; MESA: s_getpc_b64 -; MESA: buffer_store_dword v33, off, s[0:3], s32{{$}} -; MESA: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; HSA: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; HSA: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; HSA: s_getpc_b64 -; HSA: buffer_store_dword v33, off, s[0:3], s32 offset:4 -; HSA: buffer_store_dword v32, off, s[0:3], s32 offset:8 +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN: buffer_load_dword v33, off, s[0:3], s32{{$}} +; GCN: s_getpc_b64 +; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}} +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { @@ -793,27 +771,16 @@ } ; GCN-LABEL: {{^}}stack_12xv3i32: -; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; MESA: buffer_store_dword [[REG12]], {{.*$}} -; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; MESA: buffer_store_dword [[REG13]], {{.*}} offset:4 -; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; MESA: buffer_store_dword [[REG14]], {{.*}} offset:8 -; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; MESA: buffer_store_dword [[REG15]], {{.*}} offset:12 -; MESA: v_mov_b32_e32 v31, 11 -; MESA: s_getpc -; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 11 -; HSA: buffer_store_dword [[REG12]], {{.*$}} -; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; HSA: buffer_store_dword [[REG12]], {{.*}} offset:4 -; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; HSA: buffer_store_dword [[REG13]], {{.*}} offset:8 -; HSA: 
v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; HSA: buffer_store_dword [[REG14]], {{.*}} offset:12 -; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; HSA: buffer_store_dword [[REG15]], {{.*}} offset:16 -; HSA: s_getpc +; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 +; GCN: buffer_store_dword [[REG12]], {{.*$}} +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 +; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 +; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 +; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12 +; GCN: v_mov_b32_e32 v31, 11 +; GCN: s_getpc define void @stack_12xv3i32() #0 { entry: call void @external_void_func_12xv3i32( @@ -833,25 +800,16 @@ } ; GCN-LABEL: {{^}}stack_12xv3f32: -; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; MESA: buffer_store_dword [[REG12]], {{.*$}} -; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; MESA: buffer_store_dword [[REG13]], {{.*}} offset:4 -; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; MESA: buffer_store_dword [[REG14]], {{.*}} offset:8 -; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 -; MESA: buffer_store_dword [[REG15]], {{.*}} offset:12 -; MESA: v_mov_b32_e32 v31, 0x41300000 -; MESA: s_getpc -; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; HSA: buffer_store_dword [[REG12]], {{.*}} offset:4 -; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; HSA: buffer_store_dword [[REG13]], {{.*}} offset:8 -; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; HSA: buffer_store_dword [[REG14]], {{.*}} offset:12 -; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 -; HSA: buffer_store_dword [[REG15]], {{.*}} offset:16 -; HSA: s_getpc +; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 +; GCN: buffer_store_dword [[REG12]], {{.*$}} +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 +; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 +; GCN: buffer_store_dword [[REG14]], 
{{.*}} offset:8 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 +; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12 +; GCN: v_mov_b32_e32 v31, 0x41300000 +; GCN: s_getpc define void @stack_12xv3f32() #0 { entry: call void @external_void_func_12xv3f32( @@ -872,41 +830,24 @@ ; GCN-LABEL: {{^}}stack_8xv5i32: -; MESA: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 -; MESA: buffer_store_dword [[REG8]], {{.*$}} -; MESA: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 -; MESA: buffer_store_dword [[REG9]], {{.*}} offset:4 -; MESA: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 -; MESA: buffer_store_dword [[REG10]], {{.*}} offset:8 -; MESA: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; MESA: buffer_store_dword [[REG11]], {{.*}} offset:12 -; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; MESA: buffer_store_dword [[REG12]], {{.*}} offset:16 -; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; MESA: buffer_store_dword [[REG13]], {{.*}} offset:20 -; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; MESA: buffer_store_dword [[REG14]], {{.*}} offset:24 -; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; MESA: buffer_store_dword [[REG15]], {{.*}} offset:28 -; HSA: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 -; HSA: buffer_store_dword [[REG8]], {{.*}} offset:4 -; HSA: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 -; HSA: buffer_store_dword [[REG9]], {{.*}} offset:8 -; HSA: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 -; HSA: buffer_store_dword [[REG10]], {{.*}} offset:12 -; HSA: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; HSA: buffer_store_dword [[REG11]], {{.*}} offset:16 -; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; HSA: buffer_store_dword [[REG12]], {{.*}} offset:20 -; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; HSA: buffer_store_dword [[REG13]], {{.*}} offset:24 -; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; HSA: buffer_store_dword [[REG14]], {{.*}} offset:28 -; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; HSA: buffer_store_dword [[REG15]], {{.*}} offset:32 - - -; MESA: v_mov_b32_e32 v31, 7 +; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 +; GCN: buffer_store_dword 
[[REG8]], {{.*$}} +; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 +; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 +; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 +; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 +; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 +; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 +; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 +; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 + +; GCN: v_mov_b32_e32 v31, 7 ; GCN: s_getpc define void @stack_8xv5i32() #0 { entry: @@ -923,42 +864,24 @@ } ; GCN-LABEL: {{^}}stack_8xv5f32: -; MESA: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000 -; MESA: buffer_store_dword [[REG8]], {{.*$}} -; MESA: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 -; MESA: buffer_store_dword [[REG9]], {{.*}} offset:4 -; MESA: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 -; MESA: buffer_store_dword [[REG10]], {{.*}} offset:8 -; MESA: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 -; MESA: buffer_store_dword [[REG11]], {{.*}} offset:12 -; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; MESA: buffer_store_dword [[REG12]], {{.*}} offset:16 -; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; MESA: buffer_store_dword [[REG13]], {{.*}} offset:20 -; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; MESA: buffer_store_dword [[REG14]], {{.*}} offset:24 -; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 -; MESA: buffer_store_dword [[REG15]], {{.*}} offset:28 -; MESA: v_mov_b32_e32 v31, 0x40e00000 - -; HSA: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x40e00000 -; HSA: buffer_store_dword [[REG8]], {{.*$}} -; HSA: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000 -; HSA: buffer_store_dword [[REG8]], {{.*}} offset:4 -; HSA: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 
-; HSA: buffer_store_dword [[REG9]], {{.*}} offset:8 -; HSA: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 -; HSA: buffer_store_dword [[REG10]], {{.*}} offset:12 -; HSA: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 -; HSA: buffer_store_dword [[REG11]], {{.*}} offset:16 -; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; HSA: buffer_store_dword [[REG12]], {{.*}} offset:20 -; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; HSA: buffer_store_dword [[REG13]], {{.*}} offset:24 -; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; HSA: buffer_store_dword [[REG14]], {{.*}} offset:28 -; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 -; HSA: buffer_store_dword [[REG15]], {{.*}} offset:32 +; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000 +; GCN: buffer_store_dword [[REG8]], {{.*$}} +; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 +; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 +; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 +; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 +; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 +; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 +; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 +; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 + +; GCN: v_mov_b32_e32 v31, 0x40e00000 ; GCN: s_getpc define void @stack_8xv5f32() #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/call-constant.ll b/llvm/test/CodeGen/AMDGPU/call-constant.ll --- a/llvm/test/CodeGen/AMDGPU/call-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/call-constant.ll @@ -4,8 +4,8 @@ ; FIXME: Emitting unnecessary flat_scratch setup ; GCN-LABEL: {{^}}test_call_undef: -; SDAG: s_mov_b32 flat_scratch_lo, s5 -; SDAG: s_add_u32 s4, s4, s7 +; 
SDAG: s_mov_b32 flat_scratch_lo, s11 +; SDAG: s_add_u32 s10, s10, s15 ; SDAG: s_lshr_b32 ; GCN: s_endpgm define amdgpu_kernel void @test_call_undef() #0 { @@ -26,8 +26,8 @@ } ; GCN-LABEL: {{^}}test_call_null: -; SDAG: s_mov_b32 flat_scratch_lo, s5 -; SDAG: s_add_u32 s4, s4, s7 +; SDAG: s_mov_b32 flat_scratch_lo, s11 +; SDAG: s_add_u32 s10, s10, s15 ; SDAG: s_lshr_b32 ; GISEL: s_swappc_b64 s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll --- a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll @@ -65,7 +65,7 @@ ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt -; GCN-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0x3ff, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: s_setpc_b64 define hidden i32 @use_workitem_id_x(i32 %arg0) #0 { @@ -78,6 +78,7 @@ ; GCN: s_getpc_b64 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@lo+4 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@hi+12 +; GCN: v_or_b32_e32 v1, v0 ; GCN: v_mov_b32_e32 v0, 9 ; GCN: s_swappc_b64 ; GCN: v_add_f32_e32 diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -230,7 +230,7 @@ ; CI: NumSgprs: 48 ; VI-NOBUG: NumSgprs: 48 ; VI-BUG: NumSgprs: 96 -; GCN: NumVgprs: 32 +; GCN: NumVgprs: 24 define amdgpu_kernel void @count_use_sgpr96_external_call() { entry: tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> ) #1 @@ -244,7 +244,7 @@ ; CI: NumSgprs: 48 ; VI-NOBUG: NumSgprs: 48 ; VI-BUG: NumSgprs: 96 -; GCN: NumVgprs: 32 +; GCN: NumVgprs: 24 define amdgpu_kernel void @count_use_sgpr160_external_call() { entry: tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> ) #1 @@ -258,7 +258,7 @@ ; CI: NumSgprs: 48 ; VI-NOBUG: 
NumSgprs: 48 ; VI-BUG: NumSgprs: 96 -; GCN: NumVgprs: 32 +; GCN: NumVgprs: 24 define amdgpu_kernel void @count_use_vgpr160_external_call() { entry: tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> ) #1 diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -6,15 +6,15 @@ declare hidden void @external_void_func_void() #0 ; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: -; GCN: s_getpc_b64 s[44:45] -; GCN-NEXT: s_add_u32 s44, s44, -; GCN-NEXT: s_addc_u32 s45, s45, +; GCN: s_getpc_b64 s[34:35] +; GCN-NEXT: s_add_u32 s34, s34, +; GCN-NEXT: s_addc_u32 s35, s35, ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN: s_swappc_b64 s[30:31], s[44:45] +; GCN: s_swappc_b64 s[30:31], s[34:35] -; GCN-DAG: #ASMSTART -; GCN-DAG: #ASMEND -; GCN-DAG: s_swappc_b64 s[30:31], s[44:45] +; GCN-NEXT: #ASMSTART +; GCN-NEXT: #ASMEND +; GCN-NEXT: s_swappc_b64 s[30:31], s[34:35] define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { call void @external_void_func_void() call void asm sideeffect "", ""() #0 @@ -25,60 +25,24 @@ ; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: ; MUBUF: buffer_store_dword ; FLATSCR: scratch_store_dword -; GCN: v_writelane_b32 v41, s33, 15 -; GCN-NEXT: v_writelane_b32 v41, s34, 0 -; GCN-NEXT: v_writelane_b32 v41, s35, 1 -; GCN-NEXT: v_writelane_b32 v41, s36, 2 -; GCN-NEXT: v_writelane_b32 v41, s37, 3 -; GCN-NEXT: v_writelane_b32 v41, s38, 4 -; GCN-NEXT: v_writelane_b32 v41, s39, 5 -; GCN-NEXT: v_writelane_b32 v41, s40, 6 -; GCN-NEXT: v_writelane_b32 v41, s41, 7 -; GCN-NEXT: v_writelane_b32 v41, s42, 8 -; GCN-NEXT: v_writelane_b32 v41, s43, 9 -; GCN-NEXT: v_writelane_b32 v41, s44, 10 -; GCN-NEXT: v_writelane_b32 v41, 
s46, 11 -; GCN-NEXT: v_writelane_b32 v41, s47, 12 -; GCN-NEXT: v_writelane_b32 v41, s30, 13 +; GCN: v_writelane_b32 v40, s33, 4 +; GCN: v_writelane_b32 v40, s34, 0 +; GCN: v_writelane_b32 v40, s35, 1 +; GCN: v_writelane_b32 v40, s30, 2 +; GCN: v_writelane_b32 v40, s31, 3 ; GCN: s_swappc_b64 -; GCN-DAG: ;;#ASMSTART +; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_swappc_b64 - -; MUBUF-DAG: v_readlane_b32 s4, v41, 13 -; MUBUF-DAG: v_readlane_b32 s5, v41, 14 -; MUBUF-DAG: v_readlane_b32 s47, v41, 12 -; MUBUF-DAG: v_readlane_b32 s46, v41, 11 -; MUBUF-DAG: v_readlane_b32 s44, v41, 10 -; MUBUF-DAG: v_readlane_b32 s43, v41, 9 -; MUBUF-DAG: v_readlane_b32 s42, v41, 8 -; MUBUF-DAG: v_readlane_b32 s41, v41, 7 -; MUBUF-DAG: v_readlane_b32 s40, v41, 6 -; MUBUF-DAG: v_readlane_b32 s39, v41, 5 -; MUBUF-DAG: v_readlane_b32 s38, v41, 4 -; MUBUF-DAG: v_readlane_b32 s37, v41, 3 -; MUBUF-DAG: v_readlane_b32 s36, v41, 2 -; MUBUF-DAG: v_readlane_b32 s35, v41, 1 -; MUBUF-DAG: v_readlane_b32 s34, v41, 0 - -; FLATSCR: v_readlane_b32 s0, v41, 13 -; FLATSCR-DAG: v_readlane_b32 s1, v41, 14 -; FLATSCR-DAG: v_readlane_b32 s47, v41, 12 -; FLATSCR-DAG: v_readlane_b32 s46, v41, 11 -; FLATSCR-DAG: v_readlane_b32 s44, v41, 10 -; FLATSCR-DAG: v_readlane_b32 s43, v41, 9 -; FLATSCR-DAG: v_readlane_b32 s42, v41, 8 -; FLATSCR-DAG: v_readlane_b32 s41, v41, 7 -; FLATSCR-DAG: v_readlane_b32 s40, v41, 6 -; FLATSCR-DAG: v_readlane_b32 s39, v41, 5 -; FLATSCR-DAG: v_readlane_b32 s38, v41, 4 -; FLATSCR-DAG: v_readlane_b32 s37, v41, 3 -; FLATSCR-DAG: v_readlane_b32 s36, v41, 2 -; FLATSCR-DAG: v_readlane_b32 s35, v41, 1 -; FLATSCR-DAG: v_readlane_b32 s34, v41, 0 -; FLATSCR-DAG: v_readlane_b32 s33, v41, 15 - +; MUBUF-DAG: v_readlane_b32 s4, v40, 2 +; MUBUF-DAG: v_readlane_b32 s5, v40, 3 +; FLATSCR-DAG: v_readlane_b32 s0, v40, 2 +; FLATSCR-DAG: v_readlane_b32 s1, v40, 3 +; GCN: v_readlane_b32 s35, v40, 1 +; GCN: v_readlane_b32 s34, v40, 0 + +; GCN: v_readlane_b32 s33, v40, 4 ; MUBUF: buffer_load_dword ; 
FLATSCR: scratch_load_dword ; GCN: s_setpc_b64 @@ -90,19 +54,19 @@ } ; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: -; MUBUF: buffer_store_dword v41 -; GCN: v_writelane_b32 v41, s33, 15 +; MUBUF: buffer_store_dword v40 +; FLATSCR: scratch_store_dword off, v40 +; GCN: v_writelane_b32 v40, s33, 4 ; GCN: s_mov_b32 s33, s32 -; FLATSCR: s_add_u32 s32, s32, 16 -; FLATSCR: scratch_store_dword off, v40 ; MUBUF: s_add_u32 s32, s32, 0x400 +; FLATSCR: s_add_u32 s32, s32, 16 ; GCN: s_swappc_b64 -; GCN-DAG: s_swappc_b64 +; GCN-NEXT: s_swappc_b64 -; GCN: v_readlane_b32 s33, v41, 15 -; MUBUF: buffer_load_dword v41 -; FLATSCR: scratch_load_dword v41 +; GCN: v_readlane_b32 s33, v40, 4 +; MUBUF: buffer_load_dword v40 +; FLATSCR: scratch_load_dword v40 define void @test_func_call_external_void_funcx2() #0 { call void @external_void_func_void() call void @external_void_func_void() @@ -160,7 +124,7 @@ ; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31: ; GCN: v_mov_b32_e32 v40, v31 -; GCN-DAG: s_swappc_b64 +; GCN-NEXT: s_swappc_b64 ; GCN-NEXT: v_mov_b32_e32 v31, v40 define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 { %v31 = call i32 asm sideeffect "; def $0", "={v31}"() @@ -172,18 +136,18 @@ ; FIXME: What is the expected behavior for reserved registers here? 
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33: -; MUBUF: s_getpc_b64 s[18:19] -; MUBUF-NEXT: s_add_u32 s18, s18, external_void_func_void@rel32@lo+4 -; MUBUF-NEXT: s_addc_u32 s19, s19, external_void_func_void@rel32@hi+12 -; FLATSCR: s_getpc_b64 s[16:17] -; FLATSCR-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12 +; MUBUF: s_getpc_b64 s[4:5] +; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; FLATSCR: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; GCN: s_mov_b32 s32, 0 ; GCN: #ASMSTART ; GCN-NEXT: ; def s33 ; GCN-NEXT: #ASMEND -; MUBUF: s_swappc_b64 s[30:31], s[18:19] -; FLATSCR: s_swappc_b64 s[30:31], s[16:17] +; MUBUF: s_swappc_b64 s[30:31], s[4:5] +; FLATSCR: s_swappc_b64 s[30:31], s[0:1] ; GCN: ;;#ASMSTART ; GCN-NEXT: ; use s33 ; GCN-NEXT: ;;#ASMEND @@ -199,12 +163,12 @@ ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}} ; GCN-NOT: s34 -; MUBUF: s_getpc_b64 s[18:19] -; MUBUF-NEXT: s_add_u32 s18, s18, external_void_func_void@rel32@lo+4 -; MUBUF-NEXT: s_addc_u32 s19, s19, external_void_func_void@rel32@hi+12 -; FLATSCR: s_getpc_b64 s[16:17] -; FLATSCR-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12 +; MUBUF: s_getpc_b64 s[4:5] +; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; FLATSCR: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; GCN: s_mov_b32 s32, 0 ; GCN-NOT: s34 @@ -213,8 +177,8 @@ ; GCN-NEXT: ;;#ASMEND ; GCN-NOT: s34 -; MUBUF: s_swappc_b64 s[30:31], s[18:19] -; 
FLATSCR: s_swappc_b64 s[30:31], s[16:17] +; MUBUF: s_swappc_b64 s[30:31], s[4:5] +; FLATSCR: s_swappc_b64 s[30:31], s[0:1] ; GCN-NOT: s34 @@ -232,12 +196,12 @@ ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}} ; GCN-NOT: v32 -; MUBUF: s_getpc_b64 s[18:19] -; MUBUF-NEXT: s_add_u32 s18, s18, external_void_func_void@rel32@lo+4 -; MUBUF-NEXT: s_addc_u32 s19, s19, external_void_func_void@rel32@hi+12 -; FLATSCR: s_getpc_b64 s[16:17] -; FLATSCR-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12 +; MUBUF: s_getpc_b64 s[4:5] +; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; FLATSCR: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; GCN: s_mov_b32 s32, 0 ; GCN-NOT: v40 @@ -245,8 +209,8 @@ ; GCN-NEXT: ; def v40 ; GCN-NEXT: ;;#ASMEND -; MUBUF: s_swappc_b64 s[30:31], s[18:19] -; FLATSCR: s_swappc_b64 s[30:31], s[16:17] +; MUBUF: s_swappc_b64 s[30:31], s[4:5] +; FLATSCR: s_swappc_b64 s[30:31], s[0:1] ; GCN-NOT: v40 diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -5,30 +5,20 @@ define amdgpu_kernel void @call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 { ; GCN-LABEL: call_memory_arg_load: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GCN-NEXT: s_mov_b32 s12, s14 -; GCN-NEXT: s_load_dword s14, s[8:9], 0x0 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GCN-NEXT: s_add_u32 s0, s0, s17 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_add_u32 s8, s8, 8 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, s14 -; GCN-NEXT: ds_read_b32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: s_addc_u32 s9, s9, 0 -; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 -; GCN-NEXT: s_mov_b32 s13, s15 -; GCN-NEXT: s_mov_b32 s14, s16 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, v3 -; GCN-NEXT: s_getpc_b64 s[18:19] -; GCN-NEXT: s_add_u32 s18, s18, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm %vgpr = load volatile i32, i32 addrspace(3)* %ptr call void @func(i32 %vgpr) ret void @@ -38,29 +28,21 @@ define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_memory_no_dep: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GCN-NEXT: s_mov_b32 s13, s15 -; GCN-NEXT: s_mov_b32 s12, s14 -; GCN-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0 -; GCN-NEXT: s_add_u32 s0, s0, s17 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_add_u32 s8, s8, 16 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: s_addc_u32 s9, s9, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_store_dword v3, v3, s[14:15] -; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 -; GCN-NEXT: s_mov_b32 s14, s16 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_getpc_b64 s[18:19] -; GCN-NEXT: s_add_u32 s18, s18, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s19, s19, 
func@rel32@hi+12 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_dword v0, v0, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+12 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_endpgm store i32 0, i32 addrspace(1)* %ptr call void @func(i32 0) ret void @@ -69,29 +51,21 @@ ; Should not wait after the call before memory define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call: -; GCN: %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GCN-NEXT: s_add_u32 s0, s0, s17 -; GCN-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x0 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_add_u32 s8, s8, 16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: s_addc_u32 s9, s9, 0 -; GCN-NEXT: s_mov_b32 s12, s14 -; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 -; GCN-NEXT: s_mov_b32 s13, s15 -; GCN-NEXT: s_mov_b32 s14, s16 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_getpc_b64 s[18:19] -; GCN-NEXT: s_add_u32 s18, s18, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GCN-NEXT: global_store_dword v40, v40, s[34:35] -; GCN-NEXT: s_endpgm +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: 
s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: global_store_dword v40, v40, s[34:35] +; GCN-NEXT: s_endpgm call void @func(i32 0) store i32 0, i32 addrspace(1)* %ptr ret void @@ -100,28 +74,20 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call_return_val: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GCN-NEXT: s_add_u32 s0, s0, s17 -; GCN-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x0 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_add_u32 s8, s8, 16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: s_addc_u32 s9, s9, 0 -; GCN-NEXT: s_mov_b32 s12, s14 -; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 -; GCN-NEXT: s_mov_b32 s13, s15 -; GCN-NEXT: s_mov_b32 s14, s16 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_getpc_b64 s[18:19] -; GCN-NEXT: s_add_u32 s18, s18, func.return@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s19, s19, func.return@rel32@hi+12 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GCN-NEXT: global_store_dword v40, v0, s[34:35] -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, func.return@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, func.return@rel32@hi+12 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NEXT: s_swappc_b64 s[30:31], 
s[4:5] +; GCN-NEXT: global_store_dword v40, v0, s[34:35] +; GCN-NEXT: s_endpgm %rv = call i32 @func.return(i32 0) store i32 %rv, i32 addrspace(1)* %ptr ret void @@ -131,27 +97,19 @@ define amdgpu_kernel void @call_got_load(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_got_load: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GCN-NEXT: s_add_u32 s0, s0, s17 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_add_u32 s8, s8, 16 -; GCN-NEXT: s_addc_u32 s9, s9, 0 -; GCN-NEXT: s_mov_b32 s13, s15 -; GCN-NEXT: s_mov_b32 s12, s14 -; GCN-NEXT: s_getpc_b64 s[14:15] -; GCN-NEXT: s_add_u32 s14, s14, got.func@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s15, s15, got.func@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 -; GCN-NEXT: s_mov_b32 s14, s16 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @got.func(i32 0) ret void } @@ -160,14 +118,14 @@ define void @tailcall_got_load(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: tailcall_got_load: ; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, got.func@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, 
got.func@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 s[16:17] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] tail call void @got.func(i32 0) ret void } @@ -176,12 +134,12 @@ define void @tail_call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 { ; GCN-LABEL: tail_call_memory_arg_load: ; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: ds_read_b32 v0, v0 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, func@rel32@hi+12 -; GCN-NEXT: s_setpc_b64 s[16:17] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[4:5] %vgpr = load volatile i32, i32 addrspace(3)* %ptr tail call void @func(i32 %vgpr) ret void diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -amdgpu-fixed-function-abi=0 --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-fixed-function-abi=0 --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck 
-allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}use_dispatch_ptr: ; GCN: s_load_dword s{{[0-9]+}}, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt -; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31 +; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0 ; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -15,7 +15,7 @@ ; GCN-LABEL: {{^}}use_workitem_id_y: ; GCN: s_waitcnt -; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 +; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10 ; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -27,7 +27,7 @@ ; GCN-LABEL: {{^}}use_workitem_id_z: ; GCN: s_waitcnt -; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10 +; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10 ; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -39,10 +39,9 @@ ; GCN-LABEL: {{^}}use_workitem_id_xy: ; GCN: s_waitcnt -; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 +; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 +; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] -; 
GCN-NEXT: s_waitcnt -; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -56,13 +55,11 @@ ; GCN-LABEL: {{^}}use_workitem_id_xyz: ; GCN: s_waitcnt -; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 +; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 +; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 +; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] -; GCN-NEXT: s_waitcnt -; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] -; GCN-NEXT: s_waitcnt -; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -78,10 +75,9 @@ ; GCN-LABEL: {{^}}use_workitem_id_xz: ; GCN: s_waitcnt -; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 +; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 +; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] -; GCN-NEXT: s_waitcnt -; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -95,10 +91,9 @@ ; GCN-LABEL: {{^}}use_workitem_id_yz: ; GCN: s_waitcnt -; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 +; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 +; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] -; GCN-NEXT: s_waitcnt -; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -112,9 +107,11 @@ ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x: +; GCN-NOT: v0 ; GCN: s_swappc_b64 +; GCN-NOT: v0 -; GCN: 
.amdhsa_system_vgpr_workitem_id 2 +; GCN: .amdhsa_system_vgpr_workitem_id 0 define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 { call void @use_workitem_id_x() ret void @@ -122,10 +119,14 @@ ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y: -; UNPACKED-TID: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NOT: v0 +; GCN-NOT: v1 +; UNPACKED-TID: v_lshlrev_b32_e32 v0, 10, v1 +; UNPACKED-TID-NOT: v0 +; UNPACKED-TID-NOT: v1 ; GCN: s_swappc_b64 -; GCN: .amdhsa_system_vgpr_workitem_id 2 +; GCN: .amdhsa_system_vgpr_workitem_id 1 define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 { call void @use_workitem_id_y() ret void @@ -133,7 +134,11 @@ ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z: -; UNPACKED-TID: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NOT: v0 +; GCN-NOT: v2 +; UNPACKED-TID: v_lshlrev_b32_e32 v0, 20, v2 +; UNPACKED-TID-NOT: v0 +; UNPACKED-TID-NOT: v1 ; GCN: s_swappc_b64 ; GCN: .amdhsa_system_vgpr_workitem_id 2 @@ -147,6 +152,8 @@ ; UNPACKED-TID-NOT: v1 ; UNPACKED-TID: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 ; UNPACKED-TID: v_or_b32_e32 v0, v0, [[IDY]] +; GCN-NOT: v0 +; GCN-NOT: v1 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 { call void @use_workitem_id_xy() @@ -157,7 +164,9 @@ ; UNPACKED-TID-NOT: v0 ; UNPACKED-TID-NOT: v2 ; UNPACKED-TID: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 -; UNPACKED-TID: v_or_b32_e32 v31, v0, [[IDZ]] +; UNPACKED-TID: v_or_b32_e32 v0, v0, [[IDZ]] +; GCN-NOT: v0 +; GCN-NOT: v2 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 { call void @use_workitem_id_xz() @@ -169,9 +178,9 @@ ; UNPACKED-TID-NOT: v2 ; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 ; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 -; UNPACKED-TID: v_or_b32_e32 v0, v0, [[IDY]] -; UNPACKED-TID: v_or_b32_e32 v31, v0, [[IDZ]] +; UNPACKED-TID: v_or_b32_e32 v0, [[IDY]], [[IDZ]] ; GCN-NOT: v1 +; GCN-NOT: v2 ; GCN: s_swappc_b64 define amdgpu_kernel void 
@kern_indirect_use_workitem_id_yz() #1 { call void @use_workitem_id_yz() @@ -185,7 +194,8 @@ ; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 ; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 ; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, [[IDY]] -; UNPACKED-TID-DAG: v_or_b32_e32 v31, v0, [[IDZ]] +; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, [[IDZ]] +; GCN-NOT: v0 ; GCN-NOT: v1 ; GCN-NOT: v2 ; GCN: s_swappc_b64 @@ -223,8 +233,8 @@ ; GCN-LABEL: {{^}}other_arg_use_workitem_id_x: ; GCN: s_waitcnt +; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_x(i32 %arg0) #1 { %val = call i32 @llvm.amdgcn.workitem.id.x() @@ -235,9 +245,8 @@ ; GCN-LABEL: {{^}}other_arg_use_workitem_id_y: ; GCN: s_waitcnt +; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GCN: s_waitcnt -; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_y(i32 %arg0) #1 { %val = call i32 @llvm.amdgcn.workitem.id.y() @@ -248,9 +257,8 @@ ; GCN-LABEL: {{^}}other_arg_use_workitem_id_z: ; GCN: s_waitcnt +; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GCN: s_waitcnt -; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_z(i32 %arg0) #1 { %val = call i32 @llvm.amdgcn.workitem.id.z() @@ -262,10 +270,11 @@ ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x: +; GCN: v_mov_b32_e32 v1, v0 ; GCN: v_mov_b32_e32 v0, 0x22b ; GCN: s_swappc_b64 -; GCN: .amdhsa_system_vgpr_workitem_id 2 +; GCN: .amdhsa_system_vgpr_workitem_id 0 define 
amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 { call void @other_arg_use_workitem_id_x(i32 555) ret void @@ -275,13 +284,14 @@ ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y: ; UNPACKED-TID: v_lshlrev_b32_e32 v1, 10, v1 -; PACKED-TID: v_mov_b32_e32 v31, v0 +; PACKED-TID: v_mov_b32_e32 v1, v0 +; GCN-NOT: v1 ; GCN: v_mov_b32_e32 v0, 0x22b ; GCN-NOT: v1 ; GCN: s_swappc_b64 ; GCN-NOT: v0 -; GCN: .amdhsa_system_vgpr_workitem_id 2 +; GCN: .amdhsa_system_vgpr_workitem_id 1 define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 { call void @other_arg_use_workitem_id_y(i32 555) ret void @@ -290,8 +300,8 @@ ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z: ; GCN-DAG: v_mov_b32_e32 v0, 0x22b -; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v2, 20, v2 -; PACKED-TID-DAG: v_mov_b32_e32 v31, v0 +; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v1, 20, v2 +; PACKED-TID-DAG: v_mov_b32_e32 v1, v0 ; GCN: s_swappc_b64 ; GCN-NOT: v0 @@ -302,10 +312,9 @@ } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x: -; GFX90A: buffer_load_dword v32, off, s[0:3], s32{{$}} -; GCN: v_and_b32_e32 v31, 0x3ff, v31 -; GFX7: buffer_load_dword v0, off, s[0:3], s32{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v0 +; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}} +; GCN: v_and_b32_e32 v32, 0x3ff, v32 +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 ; GCN: s_setpc_b64 define void @too_many_args_use_workitem_id_x( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, @@ -357,11 +366,10 @@ ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x: ; GCN: s_mov_b32 s32, 0 -; GFX7: buffer_store_dword v3, off, s[0:3], s32{{$}} -; GFX90A: buffer_store_dword v1, off, s[0:3], s32{{$}} +; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} ; GCN: s_swappc_b64 -; GCN: .amdhsa_system_vgpr_workitem_id 2 +; GCN: .amdhsa_system_vgpr_workitem_id 0 define amdgpu_kernel void 
@kern_call_too_many_args_use_workitem_id_x() #1 { call void @too_many_args_use_workitem_id_x( i32 10, i32 20, i32 30, i32 40, @@ -377,7 +385,7 @@ ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x: ; GCN: s_mov_b32 s33, s32 -; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} +; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}} ; GCN: s_swappc_b64 define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { store volatile i32 %arg0, i32 addrspace(1)* undef @@ -425,13 +433,13 @@ ; frame[2] = VGPR spill slot ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval: -; GFX7: buffer_load_dword v0, off, s[0:3], s32 -; GFX90A: buffer_load_dword v32, off, s[0:3], s32 +; GFX7: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX90A: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GCN-DAG: s_waitcnt ; GFX7: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX90A: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32, -; GFX7: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}} -; GFX90A: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}} +; GFX7: buffer_load_dword v0, off, s[0:3], s32 glc{{$}} +; GFX90A: buffer_load_dword v0, off, s[0:3], s32 glc{{$}} ; GCN: s_setpc_b64 define void @too_many_args_use_workitem_id_x_byval( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, @@ -486,18 +494,17 @@ ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval: ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; GCN-DAG: buffer_store_dword [[K]], off, s[0:3], 0 offset:4 ; GCN-DAG: s_movk_i32 s32, 0x400 -; GFX7: buffer_store_dword v3, off, s[0:3], s32 -; GFX90A: buffer_store_dword v0, off, s[0:3], s32 +; GCN: buffer_store_dword [[K]], off, s[0:3], 0 offset:4 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4 -; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4 +; GCN: buffer_store_dword 
[[RELOAD_BYVAL]], off, s[0:3], s32{{$}} ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], ; GCN: s_swappc_b64 -; GCN: .amdhsa_system_vgpr_workitem_id 2 +; GCN: .amdhsa_system_vgpr_workitem_id 0 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 999, i32 addrspace(5)* %alloca @@ -515,12 +522,11 @@ } ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval: -; GCN: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} ; GFX7: buffer_store_dword [[K]], off, s[0:3], s33{{$}} ; GFX90A: buffer_store_dword [[K]], off, s[0:3], s33{{$}} ; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}} -; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4 +; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], ; GCN: s_swappc_b64 define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { @@ -541,20 +547,21 @@ ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz: ; GFX90A: buffer_load_dword v32, off, s[0:3], s32{{$}} -; GFX90A: v_and_b32_e32 v33, 0x3ff, v31 -; GFX90A: v_bfe_u32 v33, v31, 10, 10 -; GCN90A: v_bfe_u32 v31, v31, 20, 10 -; GFX7: v_and_b32_e32 v32, 0x3ff, v31 -; GFX7: v_bfe_u32 v32, v31, 10, 10 -; GCN7: v_bfe_u32 v31, v31, 20, 10 -; GFX7: buffer_load_dword v0, off, s[0:3], s32{{$}} -; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v12 -; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v30{{$}} -; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v0{{$}} -; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v29, off{{$}} -; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v30, off{{$}} +; GFX90A: v_and_b32_e32 v33, 0x3ff, v32 +; GFX90A: v_bfe_u32 v34, v32, 10, 10 +; GCN90A: v_bfe_u32 v32, v32, 20, 10 +; GFX7: buffer_load_dword v32, off, s[0:3], s32{{$}} +; GFX7: v_and_b32_e32 v33, 0x3ff, v32 +; GFX7: v_bfe_u32 v33, v32, 10, 10 +; GCN7: v_bfe_u32 v32, v32, 20, 
10 +; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v33{{$}} +; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32{{$}} +; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v33, off{{$}} +; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v34, off{{$}} ; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v32, off{{$}} +; GFX7-COUNT-32: flat_store_dword v{{\[[0-9]+:[0-9]+]}} +; GFX90A-COUNT-32: global_store_dword v{{\[[0-9]+:[0-9]+]}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @too_many_args_use_workitem_id_xyz( @@ -617,11 +624,11 @@ ; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v1, 10, v1 ; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, v1 ; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v2, 20, v2 -; UNPACKED-TID-DAG: v_or_b32_e32 v31, v0, v2 +; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, v2 ; PACKED-TID-NOT: v0 +; PACKED-TID-NOT: v1 ; PACKED-TID-NOT: v2 -; GFX7: buffer_store_dword v3, off, s[0:3], s32{{$}} -; GFX90A: buffer_store_dword v1, off, s[0:3], s32{{$}} +; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} ; GCN: s_swappc_b64 ; GCN: .amdhsa_system_vgpr_workitem_id 2 diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -1,8 +1,10 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VARABI %s +; RUN: llc -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIXEDABI %s ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt -; GCN: v_and_b32_e32 
[[ID:v[0-9]+]], 0x3ff, v31 +; VARABI: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0 +; FIXEDABI: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31 ; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -14,7 +16,8 @@ ; GCN-LABEL: {{^}}use_workitem_id_y: ; GCN: s_waitcnt -; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 +; VARABI: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10 +; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 ; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -26,7 +29,8 @@ ; GCN-LABEL: {{^}}use_workitem_id_z: ; GCN: s_waitcnt -; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10 +; VARABI: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10 +; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10 ; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -38,9 +42,11 @@ ; GCN-LABEL: {{^}}use_workitem_id_xy: ; GCN: s_waitcnt +; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 +; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 -; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 -; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 +; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 +; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] @@ -57,10 +63,13 @@ ; GCN-LABEL: {{^}}use_workitem_id_xyz: ; GCN: s_waitcnt +; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 +; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 +; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 -; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 -; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 -; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 +; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 +; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 +; FIXEDABI-DAG: v_bfe_u32 
[[IDZ:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] @@ -80,9 +89,11 @@ ; GCN-LABEL: {{^}}use_workitem_id_xz: ; GCN: s_waitcnt +; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 +; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 -; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 -; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 +; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 +; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] @@ -98,9 +109,11 @@ ; GCN-LABEL: {{^}}use_workitem_id_yz: ; GCN: s_waitcnt +; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 +; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 -; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 -; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 +; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 +; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] @@ -115,31 +128,38 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x: -; GCN: enable_vgpr_workitem_id = 2 +; VARABI: enable_vgpr_workitem_id = 0 +; FIXEDABI: enable_vgpr_workitem_id = 2 ; FIXEDA-NOT: v0 +; VARABI-NOT: v31 ; GCN: s_swappc_b64 -; GCN-NOT: v0 +; FIXEDABI-NOT: v0 +; VARABI-NOT: v31 define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 { call void @use_workitem_id_x() ret void } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y: -; GCN: enable_vgpr_workitem_id = 2 +; VARABI: enable_vgpr_workitem_id = 1 +; FIXEDABI: enable_vgpr_workitem_id = 2 -; GCN-NOT: v0 -; GCN-NOT: v1 +; FIXEDABI-NOT: v0 +; FIXEDABI-NOT: v1 +; VARABI-NOT: v31 +; VARABI: v_lshlrev_b32_e32 v0, 10, v1 -; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; GCN: 
v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] -; GCN-NOT: v0 -; GCN-NOT: v1 +; FIXEDABI-NOT: v0 +; FIXEDABI-NOT: v1 +; VARABI-NOT: v31 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 { @@ -150,11 +170,16 @@ ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z: ; GCN: enable_vgpr_workitem_id = 2 +; VARABI-NOT: v0 +; VARABI-NOT: v2 +; VARABI: v_lshlrev_b32_e32 v0, 20, v2 +; VARABI-NOT: v0 +; VARABI-NOT: v1 -; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 { @@ -163,11 +188,17 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xy: - -; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; VARABI-NOT: v0 +; VARABI-NOT: v1 +; VARABI: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 +; VARABI: v_or_b32_e32 v0, v0, [[IDY]] +; VARABI-NOT: v0 +; VARABI-NOT: v1 + +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 { @@ 
-176,12 +207,18 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xz: +; VARABI-NOT: v0 +; VARABI-NOT: v2 +; VARABI: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 +; VARABI: v_or_b32_e32 v0, v0, [[IDZ]] +; VARABI-NOT: v0 +; VARABI-NOT: v2 -; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 { @@ -190,12 +227,19 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_yz: +; VARABI-NOT: v1 +; VARABI-NOT: v2 +; VARABI-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 +; VARABI-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 +; VARABI: v_or_b32_e32 v0, [[IDY]], [[IDZ]] +; VARABI-NOT: v1 +; VARABI-NOT: v2 -; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 { @@ -204,11 +248,21 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xyz: - -; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; VARABI-NOT: v0 +; VARABI-NOT: v1 +; VARABI-NOT: v2 +; VARABI-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 +; VARABI-DAG: 
v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 +; VARABI-DAG: v_or_b32_e32 v0, v0, [[IDY]] +; VARABI-DAG: v_or_b32_e32 v0, v0, [[IDZ]] +; VARABI-NOT: v0 +; VARABI-NOT: v1 +; VARABI-NOT: v2 + +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_xyz() #1 { @@ -245,7 +299,8 @@ ; GCN-LABEL: {{^}}other_arg_use_workitem_id_x: ; GCN: s_waitcnt -; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31 +; VARABI-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1 +; FIXEDABI-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] @@ -258,7 +313,8 @@ ; GCN-LABEL: {{^}}other_arg_use_workitem_id_y: ; GCN: s_waitcnt -; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 +; VARABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10 +; FIXEDABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_y(i32 %arg0) #1 { @@ -270,7 +326,8 @@ ; GCN-LABEL: {{^}}other_arg_use_workitem_id_z: ; GCN: s_waitcnt -; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10 +; VARABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10 +; FIXEDABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_z(i32 %arg0) #1 { @@ -282,13 +339,16 @@ ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x: -; GCN: enable_vgpr_workitem_id = 2 +; VARABI: enable_vgpr_workitem_id = 0 +; FIXEDABI: enable_vgpr_workitem_id = 2 +; VARABI: v_mov_b32_e32 v1, v0 +; VARABI: 
v_mov_b32_e32 v0, 0x22b -; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 { @@ -298,13 +358,20 @@ ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y: - - -; GCN: enable_vgpr_workitem_id = 2 -; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; VARABI: enable_vgpr_workitem_id = 1 + +; VARABI: v_lshlrev_b32_e32 v1, 10, v1 +; VARABI-NOT: v1 +; VARABI: v_mov_b32_e32 v0, 0x22b +; VARABI-NOT: v1 +; VARABI: s_swappc_b64 +; VARABI-NOT: v0 + +; FIXEDABI: enable_vgpr_workitem_id = 2 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 { call void @other_arg_use_workitem_id_y(i32 555) ret void @@ -313,21 +380,29 @@ ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z: ; GCN: enable_vgpr_workitem_id = 2 +; VARABI-DAG: v_mov_b32_e32 v0, 0x22b +; VARABI-DAG: v_lshlrev_b32_e32 v1, 20, v2 +; VARABI: s_swappc_b64 +; VARABI-NOT: v0 -; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; 
FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 { call void @other_arg_use_workitem_id_z(i32 555) ret void } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x: +; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}} +; VARABI: v_and_b32_e32 v32, 0x3ff, v32 +; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 +; VARABI: s_setpc_b64 -; GCN: v_and_b32_e32 v31, 0x3ff, v31 -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} +; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31 +; FIXEDABI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} define void @too_many_args_use_workitem_id_x( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, @@ -376,19 +451,23 @@ } ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x: +; VARABI: enable_vgpr_workitem_id = 0 +; VARABI: s_mov_b32 s32, 0 +; VARABI: buffer_store_dword v0, off, s[0:3], s32{{$}} +; VARABI: s_swappc_b64 -; GCN: enable_vgpr_workitem_id = 2 -; GCN-DAG: s_mov_b32 s32, 0 -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}} -; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; GCN-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] -; GCN: buffer_store_dword [[K]], off, s[0:3], s32{{$}} +; FIXEDABI: enable_vgpr_workitem_id = 2 +; FIXEDABI-DAG: s_mov_b32 s32, 0 +; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}} +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; FIXEDABI: buffer_store_dword [[K]], 
off, s[0:3], s32{{$}} -; GCN: s_swappc_b64 +; FIXEDABI: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 { call void @too_many_args_use_workitem_id_x( i32 10, i32 20, i32 30, i32 40, @@ -403,13 +482,15 @@ } ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x: +; VARABI: s_mov_b32 s33, s32 +; VARABI: buffer_store_dword v1, off, s[0:3], s32{{$}} ; Touching the workitem id register is not necessary. -; GCN-NOT: v31 -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}} -; GCN-NOT: v31 -; GCN: buffer_store_dword [[K]], off, s[0:3], s32{{$}} -; GCN-NOT: v31 +; FIXEDABI-NOT: v31 +; FIXEDABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}} +; FIXEDABI-NOT: v31 +; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}} +; FIXEDABI-NOT: v31 ; GCN: s_swappc_b64 define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { @@ -458,15 +539,21 @@ ; frame[2] = VGPR spill slot ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval: +; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VARABI-NEXT: s_waitcnt +; VARABI-NEXT: v_and_b32_e32 v32, 0x3ff, v32 +; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32 +; VARABI: buffer_load_dword v0, off, s[0:3], s32 glc{{$}} +; VARABI: s_setpc_b64 -; GCN: v_and_b32_e32 v31, 0x3ff, v31 -; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31 +; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31 +; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31 -; GCN: buffer_load_dword v0, off, s[0:3], s32{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GCN: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}} -; GCN: s_setpc_b64 +; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32{{$}} +; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}} +; FIXEDABI: s_setpc_b64 define void @too_many_args_use_workitem_id_x_byval( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, 
i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, @@ -520,27 +607,36 @@ ; sp[2] = stack passed workitem ID x ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval: +; VARABI: enable_vgpr_workitem_id = 0 +; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; VARABI: s_movk_i32 s32, 0x400{{$}} +; VARABI: buffer_store_dword [[K]], off, s[0:3], 0 offset:4 +; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4 +; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} +; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]], +; VARABI: s_swappc_b64 -; GCN: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7 -; GCN: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}} -; GCN: s_movk_i32 s32, 0x400{{$}} -; GCN: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140 +; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7 +; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}} +; FIXEDABI: s_movk_i32 s32, 0x400{{$}} +; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140 -; GCN: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} +; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} ; FIXME: Why this reload? 
-; GCN: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0 offset:4{{$}} +; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0 offset:4{{$}} -; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] -; GCN-NOT: s32 -; GCN: buffer_store_dword [[RELOAD]], off, s[0:3], s32 offset:4 -; GCN: s_swappc_b64 +; FIXEDABI-NOT: s32 +; FIXEDABI: buffer_store_dword [[RELOAD]], off, s[0:3], s32 offset:4 +; FIXEDABI: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 999, i32 addrspace(5)* %alloca @@ -558,19 +654,26 @@ } ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval: +; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; VARABI: buffer_store_dword [[K]], off, s[0:3], s33{{$}} +; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}} +; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} +; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]], +; VARABI: s_swappc_b64 ; FIXED-ABI-NOT: v31 -; GCN: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}} -; GCN: buffer_store_dword [[K0]], off, s[0:3], s33{{$}} -; GCN: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}} -; GCN: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} -; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}} +; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}} +; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}} +; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}} +; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], 
s32{{$}} +; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}} ; FIXED-ABI-NOT: v31 -; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} +; FIXEDABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} ; FIXED-ABI-NOT: v31 -; GCN: s_swappc_b64 +; FIXEDABI: s_swappc_b64 define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 999, i32 addrspace(5)* %alloca @@ -588,17 +691,29 @@ } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz: - - - -; GCN: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v31 -; GCN-NOT: buffer_load_dword -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]] -; GCN-NOT: buffer_load_dword -; GCN: v_bfe_u32 [[BFE_Y:v[0-9]+]], v31, 10, 10 -; GCN-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v31, 20, 10 -; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]] -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]] +; VARABI-NOT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} +; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}} +; VARABI-NOT: buffer_load_dword + +; VARABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v32 +; VARABI-NOT: buffer_load_dword +; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]] +; VARABI-NOT: buffer_load_dword +; VARABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v32, 10, 10 +; VARABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v32, 20, 10 +; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]] +; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]] +; VARABI: s_setpc_b64 + + +; FIXEDABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v31 +; FIXEDABI-NOT: buffer_load_dword +; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]] +; FIXEDABI-NOT: buffer_load_dword +; FIXEDABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v31, 10, 10 +; FIXEDABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v31, 20, 10 +; FIXEDABI-NEXT: 
{{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]] +; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]] define void @too_many_args_use_workitem_id_xyz( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, @@ -659,10 +774,12 @@ ; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 ; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 ; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; VARABI-DAG: v_or_b32_e32 [[PACKEDID:v[0-9]+]], [[TMP2]], [[TMP0]] +; VARABI: buffer_store_dword [[PACKEDID]], off, s[0:3], s32{{$}} -; GCN-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140 -; GCN: buffer_store_dword [[K]], off, s[0:3], s32{{$}} +; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140 +; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}} ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 { diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -64,64 +64,45 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_call: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_u32 s12, s12, s17 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_add_u32 s0, s0, s17 -; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX803-NEXT: s_add_u32 s4, s4, s7 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; GFX803-NEXT: s_add_u32 s0, s0, s7 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 -; GFX803-NEXT: s_mov_b32 s12, s14 -; GFX803-NEXT: s_mov_b32 s13, s15 -; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 -; GFX803-NEXT: s_mov_b32 s14, s16 -; GFX803-NEXT: s_getpc_b64 s[18:19] -; GFX803-NEXT: 
s_add_u32 s18, s18, ex@rel32@lo+4 -; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GFX803-NEXT: s_getpc_b64 s[4:5] +; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 ; GFX803-NEXT: s_mov_b32 s32, 0 -; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_kern_call: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s17 -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s7 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_getpc_b64 s[18:19] -; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX900-NEXT: s_getpc_b64 s[4:5] +; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 ; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX900-NEXT: s_endpgm - +; ; GFX1010-LABEL: test_kern_call: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_add_u32 s12, s12, s17 -; GFX1010-NEXT: s_mov_b32 s32, 0 -; GFX1010-NEXT: s_addc_u32 s13, s13, 0 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1010-NEXT: s_add_u32 s0, s0, s17 -; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_mov_b32 s12, s14 -; GFX1010-NEXT: s_mov_b32 s13, s15 -; 
GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1010-NEXT: s_mov_b32 s14, s16 -; GFX1010-NEXT: s_getpc_b64 s[18:19] -; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 -; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 -; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX1010-NEXT: s_endpgm +; GFX1010-NEXT: s_add_u32 s4, s4, s7 +; GFX1010-NEXT: s_mov_b32 s32, 0 +; GFX1010-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: s_getpc_b64 s[4:5] +; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1010-NEXT: s_endpgm entry: tail call void @ex() #0 ret void @@ -130,73 +111,54 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_stack_and_call: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_u32 s12, s12, s17 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_add_u32 s0, s0, s17 -; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX803-NEXT: s_add_u32 s4, s4, s7 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; GFX803-NEXT: s_add_u32 s0, s0, s7 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 -; GFX803-NEXT: s_mov_b32 s12, s14 -; GFX803-NEXT: v_mov_b32_e32 v3, 0 -; GFX803-NEXT: s_mov_b32 s13, s15 -; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 -; GFX803-NEXT: s_mov_b32 s14, s16 -; GFX803-NEXT: s_getpc_b64 s[18:19] -; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 -; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX803-NEXT: v_mov_b32_e32 v0, 0 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GFX803-NEXT: s_getpc_b64 s[4:5] +; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 
s5, s5, ex@rel32@hi+12 ; GFX803-NEXT: s_movk_i32 s32, 0x400 -; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 +; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_kern_stack_and_call: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s17 -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s7 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_getpc_b64 s[18:19] -; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_getpc_b64 s[4:5] +; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 ; GFX900-NEXT: s_movk_i32 s32, 0x400 -; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX900-NEXT: s_endpgm - +; ; GFX1010-LABEL: test_kern_stack_and_call: -; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_add_u32 s12, s12, s17 -; GFX1010-NEXT: s_movk_i32 s32, 0x200 -; GFX1010-NEXT: s_addc_u32 s13, s13, 0 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; 
GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1010-NEXT: v_mov_b32_e32 v3, 0 -; GFX1010-NEXT: s_add_u32 s0, s0, s17 -; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_mov_b32 s12, s14 -; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1010-NEXT: s_mov_b32 s13, s15 -; GFX1010-NEXT: s_mov_b32 s14, s16 -; GFX1010-NEXT: s_getpc_b64 s[18:19] -; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 -; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 -; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 -; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX1010-NEXT: s_endpgm +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_add_u32 s4, s4, s7 +; GFX1010-NEXT: s_movk_i32 s32, 0x200 +; GFX1010-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: s_getpc_b64 s[4:5] +; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 +; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1010-NEXT: s_endpgm entry: %x = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %x, align 4 @@ -209,7 +171,7 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_mov_b32 s33, 0 ; GFX803-NEXT: s_endpgm - +; ; GFX900-LABEL: test_force_fp_kern_empty: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_mov_b32 s33, 0 @@ -271,67 +233,48 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { ; GFX803-LABEL: test_force_fp_kern_call: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_u32 s12, s12, s17 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_add_u32 s0, s0, s17 -; GFX803-NEXT: 
v_lshlrev_b32_e32 v2, 20, v2 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX803-NEXT: s_add_u32 s4, s4, s7 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; GFX803-NEXT: s_add_u32 s0, s0, s7 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 -; GFX803-NEXT: s_mov_b32 s12, s14 -; GFX803-NEXT: s_mov_b32 s13, s15 -; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 -; GFX803-NEXT: s_mov_b32 s14, s16 -; GFX803-NEXT: s_getpc_b64 s[18:19] -; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 -; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GFX803-NEXT: s_getpc_b64 s[4:5] +; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 ; GFX803-NEXT: s_mov_b32 s32, 0 ; GFX803-NEXT: s_mov_b32 s33, 0 -; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_force_fp_kern_call: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s17 -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s7 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_getpc_b64 s[18:19] -; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX900-NEXT: s_getpc_b64 s[4:5] +; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 ; GFX900-NEXT: s_mov_b32 s32, 0 ; GFX900-NEXT: s_mov_b32 s33, 0 -; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX900-NEXT: s_endpgm ; 
; GFX1010-LABEL: test_force_fp_kern_call: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT s_add_u32 s12, s12, s17 -; GFX1010-NEXT s_mov_b32 s32, 0 -; GFX1010-NEXT s_mov_b32 s33, 0 -; GFX1010-NEXT s_addc_u32 s13, s13, 0 -; GFX1010-NEXT s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX1010-NEXT s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX1010-NEXT v_lshlrev_b32_e32 v2, 20, v2 -; GFX1010-NEXT v_lshlrev_b32_e32 v1, 10, v1 -; GFX1010-NEXT s_add_u32 s0, s0, s17 -; GFX1010-NEXT s_addc_u32 s1, s1, 0 -; GFX1010-NEXT s_mov_b32 s12, s14 -; GFX1010-NEXT s_mov_b32 s13, s15 -; GFX1010-NEXT v_or3_b32 v31, v0, v1, v2 -; GFX1010-NEXT s_mov_b32 s14, s16 -; GFX1010-NEXT s_getpc_b64 s[18:19] -; GFX1010-NEXT s_add_u32 s18, s18, ex@rel32@lo+4 -; GFX1010-NEXT s_addc_u32 s19, s19, ex@rel32@hi+12 -; GFX1010-NEXT s_swappc_b64 s[30:31], s[18:19] -; GFX1010-NEXT s_endpgm +; GFX1010-NEXT: s_add_u32 s4, s4, s7 +; GFX1010-NEXT: s_mov_b32 s32, 0 +; GFX1010-NEXT: s_mov_b32 s33, 0 +; GFX1010-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: s_getpc_b64 s[4:5] +; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1010-NEXT: s_endpgm entry: tail call void @ex() #2 ret void @@ -340,76 +283,57 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 { ; GFX803-LABEL: test_force_fp_kern_stack_and_call: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_u32 s12, s12, s17 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_add_u32 s0, s0, s17 -; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GFX803-NEXT: s_addc_u32 s1, s1, 0 -; GFX803-NEXT: 
s_mov_b32 s12, s14 +; GFX803-NEXT: s_add_u32 s4, s4, s7 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; GFX803-NEXT: s_add_u32 s0, s0, s7 ; GFX803-NEXT: s_mov_b32 s33, 0 -; GFX803-NEXT: v_mov_b32_e32 v3, 0 -; GFX803-NEXT: s_mov_b32 s13, s15 -; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 -; GFX803-NEXT: s_mov_b32 s14, s16 -; GFX803-NEXT: s_getpc_b64 s[18:19] -; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 -; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: v_mov_b32_e32 v0, 0 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GFX803-NEXT: s_getpc_b64 s[4:5] +; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 ; GFX803-NEXT: s_movk_i32 s32, 0x400 -; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4 +; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_force_fp_kern_stack_and_call: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s17 -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s7 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 -; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: s_mov_b32 s33, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_getpc_b64 s[18:19] -; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_getpc_b64 s[4:5] +; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; 
GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 ; GFX900-NEXT: s_movk_i32 s32, 0x400 -; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX900-NEXT: s_endpgm ; ; GFX1010-LABEL: test_force_fp_kern_stack_and_call: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_add_u32 s12, s12, s17 -; GFX1010-NEXT: s_movk_i32 s32, 0x200 -; GFX1010-NEXT: s_mov_b32 s33, 0 -; GFX1010-NEXT: s_addc_u32 s13, s13, 0 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1010-NEXT: v_mov_b32_e32 v3, 0 -; GFX1010-NEXT: s_add_u32 s0, s0, s17 -; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_mov_b32 s12, s14 -; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1010-NEXT: s_mov_b32 s13, s15 -; GFX1010-NEXT: s_mov_b32 s14, s16 -; GFX1010-NEXT: s_getpc_b64 s[18:19] -; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 -; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 -; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4 -; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX1010-NEXT: s_endpgm +; GFX1010-NEXT: s_add_u32 s4, s4, s7 +; GFX1010-NEXT: s_movk_i32 s32, 0x200 +; GFX1010-NEXT: s_mov_b32 s33, 0 +; GFX1010-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: s_getpc_b64 s[4:5] +; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 +; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; 
GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1010-NEXT: s_endpgm entry: %x = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %x, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -27,18 +27,18 @@ ; GCN-LABEL: call_split_type_used_outside_block_v2f32: ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v40, s33, 2 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, func_v2f32@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, func_v2f32@rel32@hi+12 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, func_v2f32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, func_v2f32@rel32@hi+12 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_readlane_b32 s4, v40, 0 ; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 @@ -59,29 +59,30 @@ define float @call_split_type_used_outside_block_v3f32() #0 { ; GCN-LABEL: call_split_type_used_outside_block_v3f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: 
s_mov_b32 s33, s32 -; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, func_v3f32@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, func_v3f32@rel32@hi+12 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_readlane_b32 s4, v40, 0 -; GCN-NEXT: v_readlane_b32 s5, v40, 1 -; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_add_u32 s32, s32, 0x400 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, func_v3f32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, func_v3f32@rel32@hi+12 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 +; GCN-NEXT: s_sub_u32 s32, s32, 0x400 +; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] +bb0: %split.ret.type = call <3 x float> @func_v3f32() br label %bb1 @@ -93,29 +94,28 @@ define half @call_split_type_used_outside_block_v4f16() #0 { ; GCN-LABEL: call_split_type_used_outside_block_v4f16: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, func_v4f16@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, func_v4f16@rel32@hi+12 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_readlane_b32 s4, v40, 0 -; GCN-NEXT: v_readlane_b32 s5, v40, 1 -; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] - +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_add_u32 s32, s32, 0x400 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, func_v4f16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, func_v4f16@rel32@hi+12 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 +; GCN-NEXT: s_sub_u32 s32, s32, 0x400 +; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] bb0: %split.ret.type = call <4 x half> @func_v4f16() br label %bb1 @@ -128,29 +128,29 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 { ; GCN-LABEL: 
call_split_type_used_outside_block_struct: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, func_struct@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, func_struct@rel32@hi+12 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_readlane_b32 s4, v40, 0 -; GCN-NEXT: v_mov_b32_e32 v1, v4 -; GCN-NEXT: v_readlane_b32 s5, v40, 1 -; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_add_u32 s32, s32, 0x400 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, func_struct@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, func_struct@rel32@hi+12 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_mov_b32_e32 v1, v4 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 +; GCN-NEXT: s_sub_u32 s32, s32, 0x400 +; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, 
s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] bb0: %split.ret.type = call { <4 x i32>, <4 x half> } @func_struct() br label %bb1 @@ -168,40 +168,32 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 { ; GCN-LABEL: v3i16_registers: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GCN-NEXT: s_load_dword s12, s[8:9], 0x0 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GCN-NEXT: s_add_u32 s0, s0, s17 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s12, 1, s12 -; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 1 -; GCN-NEXT: s_and_b64 vcc, exec, s[12:13] -; GCN-NEXT: s_cbranch_vccnz BB4_2 -; GCN-NEXT: ; %bb.1: ; %if.else -; GCN-NEXT: s_add_u32 s8, s8, 8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: s_addc_u32 s9, s9, 0 -; GCN-NEXT: s_mov_b32 s12, s14 -; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 -; GCN-NEXT: s_mov_b32 s13, s15 -; GCN-NEXT: s_mov_b32 s14, s16 -; GCN-NEXT: s_getpc_b64 s[18:19] -; GCN-NEXT: s_add_u32 s18, s18, func_v3i16@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s19, s19, func_v3i16@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GCN-NEXT: s_branch BB4_3 -; GCN-NEXT: BB4_2: -; GCN-NEXT: s_mov_b32 s4, 0 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: BB4_3: ; %if.end -; GCN-NEXT: global_store_short v[0:1], v1, off -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_and_b32 s4, 1, s4 +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1 +; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_cbranch_vccnz BB4_2 +; GCN-NEXT: ; %bb.1: ; 
%if.else +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, func_v3i16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, func_v3i16@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_branch BB4_3 +; GCN-NEXT: BB4_2: +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: BB4_3: ; %if.end +; GCN-NEXT: global_store_short v[0:1], v1, off +; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: s_endpgm entry: br i1 %cond, label %if.then, label %if.else @@ -221,36 +213,32 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 { ; GCN-LABEL: v3f16_registers: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GCN-NEXT: s_load_dword s12, s[8:9], 0x0 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GCN-NEXT: s_add_u32 s0, s0, s17 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s12, 1, s12 -; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 1 -; GCN-NEXT: s_and_b64 vcc, exec, s[12:13] -; GCN-NEXT: s_cbranch_vccnz BB5_2 -; GCN-NEXT: %bb.1: ; %if.else -; GCN-NEXT: s_add_u32 s8, s8, 8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: s_addc_u32 s9, s9, 0 -; GCN-NEXT: s_mov_b32 s12, s14 -; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 -; GCN-NEXT: s_mov_b32 s13, s15 -; GCN-NEXT: s_mov_b32 s14, s16 -; GCN-NEXT: s_getpc_b64 s[18:19] -; GCN-NEXT: s_add_u32 s18, s18, func_v3f16@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s19, s19, func_v3f16@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GCN-NEXT: s_branch BB5_3 +; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_and_b32 s4, 1, s4 +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1 +; GCN-NEXT: 
s_and_b64 vcc, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_cbranch_vccnz BB5_2 +; GCN-NEXT: ; %bb.1: ; %if.else +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, func_v3f16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, func_v3f16@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_branch BB5_3 ; GCN-NEXT: BB5_2: -; GCN-NEXT: s_mov_b32 s4, 0 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: BB5_3: ; %if.end +; GCN-NEXT: global_store_short v[0:1], v1, off +; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: s_endpgm entry: br i1 %cond, label %if.then, label %if.else diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll @@ -0,0 +1,25 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: define internal void @indirect() #0 { +define internal void @indirect() { + ret void +} + +; GCN-LABEL: define internal void @direct() #1 { +define internal void @direct() { + %fptr = alloca void()* + store void()* @indirect, void()** %fptr + %fp = load void()*, void()** %fptr + call void %fp() + ret void +} + +; GCN-LABEL: define amdgpu_kernel void @test_direct_indirect_call() #2 { +define amdgpu_kernel void @test_direct_indirect_call() { + call void @direct() + ret void +} + +; attributes #0 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } +; attributes #1 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-stack-objects" 
"amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "uniform-work-group-size"="false" } +; attributes #2 = { "amdgpu-calls" "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll @@ -0,0 +1,22 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: define internal void @indirect() #0 { +define internal void @indirect() { + ret void +} + +; GCN-LABEL: define amdgpu_kernel void @test_simple_indirect_call() #1 { +define amdgpu_kernel void @test_simple_indirect_call() #0 { + %fptr = alloca void()* + store void()* @indirect, void()** %fptr + %fp = load void()*, void()** %fptr + call void %fp() + ret void +} + +attributes #0 = { "amdgpu-dispatch-id" } + +; compiler modification to attributes +attributes #0 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } +attributes #1 = { "amdgpu-calls" "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-stack-objects" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } + diff --git a/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll b/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll --- a/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll @@ -11,25 +11,25 @@ ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN: %13:vgpr_32, %14:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GCN: %15:vgpr_32, %16:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GCN: %17:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %15, 0, 0, implicit $mode, implicit $exec + ; GCN: %6:vgpr_32, %7:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN: %8:vgpr_32, %9:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN: %10:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %8, 0, 0, implicit $mode, implicit $exec ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode - ; GCN: %21:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %15, 0, %17, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec - ; GCN: %22:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %21, 0, %17, 0, %17, 0, 0, implicit $mode, implicit $exec - ; GCN: %23:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %13, 0, %22, 0, 0, implicit $mode, implicit $exec - ; GCN: %24:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %15, 0, %23, 0, %13, 0, 0, implicit $mode, implicit $exec - ; GCN: %25:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %24, 0, %22, 0, %23, 0, 0, implicit $mode, implicit $exec - ; GCN: %26:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %15, 0, %25, 0, %13, 0, 0, implicit $mode, implicit $exec + ; GCN: %14:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %10, 0, killed 
[[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec + ; GCN: %15:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %14, 0, %10, 0, %10, 0, 0, implicit $mode, implicit $exec + ; GCN: %16:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %6, 0, %15, 0, 0, implicit $mode, implicit $exec + ; GCN: %17:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %16, 0, %6, 0, 0, implicit $mode, implicit $exec + ; GCN: %18:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %17, 0, %15, 0, %16, 0, 0, implicit $mode, implicit $exec + ; GCN: %19:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %18, 0, %6, 0, 0, implicit $mode, implicit $exec ; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode - ; GCN: $vcc = COPY %14 - ; GCN: %27:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %26, 0, %22, 0, %25, 0, 0, implicit $mode, implicit $vcc, implicit $exec - ; GCN: %28:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %27, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN: $vcc = COPY %7 + ; GCN: %20:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %19, 0, %15, 0, %18, 0, 0, implicit $mode, implicit $vcc, implicit $exec + ; GCN: %21:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %20, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GCN: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] - ; GCN: $vgpr0 = COPY %28 + ; GCN: $vgpr0 = COPY %21 ; GCN: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] ; GCN: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 entry: @@ -44,25 +44,25 @@ ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN: %13:vgpr_32, %14:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GCN: %15:vgpr_32, %16:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GCN: %17:vgpr_32 = 
nnan nofpexcept V_RCP_F32_e64 0, %15, 0, 0, implicit $mode, implicit $exec + ; GCN: %6:vgpr_32, %7:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN: %8:vgpr_32, %9:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN: %10:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %8, 0, 0, implicit $mode, implicit $exec ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode - ; GCN: %21:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %15, 0, %17, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec - ; GCN: %22:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %21, 0, %17, 0, %17, 0, 0, implicit $mode, implicit $exec - ; GCN: %23:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %13, 0, %22, 0, 0, implicit $mode, implicit $exec - ; GCN: %24:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %15, 0, %23, 0, %13, 0, 0, implicit $mode, implicit $exec - ; GCN: %25:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %24, 0, %22, 0, %23, 0, 0, implicit $mode, implicit $exec - ; GCN: %26:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %15, 0, %25, 0, %13, 0, 0, implicit $mode, implicit $exec + ; GCN: %14:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %10, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec + ; GCN: %15:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %14, 0, %10, 0, %10, 0, 0, implicit $mode, implicit $exec + ; GCN: %16:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %6, 0, %15, 0, 0, implicit $mode, implicit $exec + ; GCN: %17:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %16, 0, %6, 0, 0, implicit $mode, implicit $exec + ; GCN: %18:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %17, 0, %15, 0, %16, 0, 0, 
implicit $mode, implicit $exec + ; GCN: %19:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %18, 0, %6, 0, 0, implicit $mode, implicit $exec ; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode - ; GCN: $vcc = COPY %14 - ; GCN: %27:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %26, 0, %22, 0, %25, 0, 0, implicit $mode, implicit $vcc, implicit $exec - ; GCN: %28:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %27, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN: $vcc = COPY %7 + ; GCN: %20:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %19, 0, %15, 0, %18, 0, 0, implicit $mode, implicit $vcc, implicit $exec + ; GCN: %21:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %20, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GCN: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] - ; GCN: $vgpr0 = COPY %28 + ; GCN: $vgpr0 = COPY %21 ; GCN: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] ; GCN: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 entry: diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll --- a/llvm/test/CodeGen/AMDGPU/ipra.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra.ll @@ -31,7 +31,7 @@ ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8 ; GCN: ; NumSgprs: 37 -; GCN: ; NumVgprs: 32 +; GCN: ; NumVgprs: 9 define amdgpu_kernel void @kernel_call() #0 { %vgpr = load volatile i32, i32 addrspace(1)* undef tail call void @func() @@ -53,7 +53,7 @@ ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8 ; GCN: ; NumSgprs: 32 -; GCN: ; NumVgprs: 32 +; GCN: ; NumVgprs: 9 define void @func_regular_call() #1 { %vgpr = load volatile i32, i32 addrspace(1)* undef tail call void @func() @@ -63,13 +63,13 @@ ; GCN-LABEL: {{^}}func_tail_call: ; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, -; GCN-NEXT: s_addc_u32 s17, -; GCN-NEXT: s_setpc_b64 s[16:17] +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, +; GCN-NEXT: 
s_addc_u32 s5, +; GCN-NEXT: s_setpc_b64 s[4:5] ; GCN: ; NumSgprs: 32 -; GCN: ; NumVgprs: 32 +; GCN: ; NumVgprs: 8 define void @func_tail_call() #1 { tail call void @func() ret void @@ -82,7 +82,7 @@ ; GCN: s_setpc_b64 ; GCN: ; NumSgprs: 32 -; GCN: ; NumVgprs: 32 +; GCN: ; NumVgprs: 9 define void @func_call_tail_call() #1 { %vgpr = load volatile i32, i32 addrspace(1)* undef tail call void @func() diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll --- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll @@ -13,9 +13,9 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_write_b32 v0, v0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX8-NEXT: s_trap 2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -37,7 +37,7 @@ ; GFX8-LABEL: func_use_lds_global_constexpr_cast: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX8-NEXT: s_trap 2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -59,8 +59,7 @@ ; GCN-LABEL: {{^}}func_implicitarg_ptr: ; GCN: s_waitcnt -; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 -; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @func_implicitarg_ptr() #0 { @@ -72,8 +71,7 @@ ; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr: ; GCN: s_waitcnt -; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 -; MESA: 
s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @opencl_func_implicitarg_ptr() #0 { @@ -114,11 +112,10 @@ ; HSA: kernarg_segment_byte_size = 112 ; MESA: kernarg_segment_byte_size = 128 -; HSA: s_add_u32 s8, s8, 0x70 +; HSA: s_add_u32 s4, s4, 0x70 ; MESA: s_add_u32 s4, s4, 0x70 -; HSA: s_addc_u32 s9, s9, 0{{$}} -; MESA: s_addc_u32 s5, s5, 0{{$}} +; GCN: s_addc_u32 s5, s5, 0{{$}} ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 { call void @func_implicitarg_ptr() @@ -130,10 +127,8 @@ ; HSA: kernarg_segment_byte_size = 160 ; MESA: kernarg_segment_byte_size = 128 -; HSA: s_add_u32 s8, s8, 0x70 -; HSA: s_addc_u32 s9, s9, 0{{$}} -; MESA: s_add_u32 s4, s4, 0x70 -; MESA: s_addc_u32 s5, s5, 0{{$}} +; GCN: s_add_u32 s4, s4, 0x70 +; GCN: s_addc_u32 s5, s5, 0{{$}} ; GCN: s_swappc_b64 define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 { call void @func_implicitarg_ptr() @@ -141,24 +136,18 @@ } ; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func: -; HSA-NOT: s8 -; HSA-NOT: s9 -; HSA-NOT: s[8:9] -; MESA-NOT: s4 -; MESA-NOT: s5 -; MESA-NOT: s[4:5] +; GCN-NOT: s4 +; GCN-NOT: s5 +; GCN-NOT: s[4:5] define void @func_call_implicitarg_ptr_func() #0 { call void @func_implicitarg_ptr() ret void } ; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func: -; HSA-NOT: s8 -; HSA-NOT: s9 -; HSA-NOT: s[8:9] -; MESA-NOT: s4 -; MESA-NOT: s5 -; MESA-NOT: s[4:5] +; GCN-NOT: s4 +; GCN-NOT: s5 +; GCN-NOT: s[4:5] define void @opencl_func_call_implicitarg_ptr_func() #0 { call void @func_implicitarg_ptr() ret void @@ -168,8 +157,7 @@ ; GCN: s_waitcnt ; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0 ; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0 -; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 -; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; GCN: s_waitcnt lgkmcnt(0) define void 
@func_kernarg_implicitarg_ptr() #0 { %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() @@ -185,8 +173,7 @@ ; GCN: s_waitcnt ; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0 ; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0 -; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 -; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; GCN: s_waitcnt lgkmcnt(0) define void @opencl_func_kernarg_implicitarg_ptr() #0 { %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() @@ -199,10 +186,8 @@ } ; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func: -; HSA: s_add_u32 s8, s8, 0x70 -; HSA: s_addc_u32 s9, s9, 0 -; MESA: s_add_u32 s4, s4, 0x70 -; MESA: s_addc_u32 s5, s5, 0 +; GCN: s_add_u32 s4, s4, 0x70 +; GCN: s_addc_u32 s5, s5, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 { call void @func_kernarg_implicitarg_ptr() diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -187,98 +187,49 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 { ; GFX9-LABEL: slsr1_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: v_writelane_b32 v44, s33, 15 -; GFX9-NEXT: v_writelane_b32 v44, s34, 0 -; GFX9-NEXT: v_writelane_b32 v44, s35, 1 -; GFX9-NEXT: v_writelane_b32 v44, s36, 2 -; GFX9-NEXT: v_writelane_b32 v44, s37, 3 -; GFX9-NEXT: v_writelane_b32 v44, s38, 4 -; GFX9-NEXT: v_writelane_b32 v44, s39, 5 -; GFX9-NEXT: v_writelane_b32 v44, s40, 6 -; GFX9-NEXT: v_writelane_b32 v44, s41, 7 -; GFX9-NEXT: v_writelane_b32 v44, s42, 8 -; GFX9-NEXT: v_writelane_b32 v44, s43, 9 -; 
GFX9-NEXT: v_writelane_b32 v44, s44, 10 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_add_u32 s32, s32, 0x800 -; GFX9-NEXT: s_mov_b64 s[40:41], s[4:5] -; GFX9-NEXT: v_writelane_b32 v44, s46, 11 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v44, s47, 12 -; GFX9-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-NEXT: v_mov_b32_e32 v42, v0 -; GFX9-NEXT: v_writelane_b32 v44, s30, 13 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, v42, v41 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: v_writelane_b32 v44, s31, 14 -; GFX9-NEXT: v_mov_b32_e32 v40, v31 -; GFX9-NEXT: s_mov_b32 s42, s14 -; GFX9-NEXT: s_mov_b32 s43, s13 -; GFX9-NEXT: s_mov_b32 s44, s12 -; GFX9-NEXT: s_mov_b64 s[34:35], s[10:11] -; GFX9-NEXT: s_mov_b64 s[36:37], s[8:9] -; GFX9-NEXT: s_mov_b64 s[38:39], s[6:7] -; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v41 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[46:47] -; GFX9-NEXT: v_mad_u32_u24 v41, v42, v41, v43 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] -; GFX9-NEXT: s_mov_b64 s[8:9], s[36:37] -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s44 -; GFX9-NEXT: s_mov_b32 s13, s43 -; GFX9-NEXT: s_mov_b32 s14, s42 -; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: v_mov_b32_e32 v0, v41 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[46:47] -; GFX9-NEXT: v_add_u32_e32 v0, v41, v43 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] -; GFX9-NEXT: s_mov_b64 s[8:9], s[36:37] -; 
GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s44 -; GFX9-NEXT: s_mov_b32 s13, s43 -; GFX9-NEXT: s_mov_b32 s14, s42 -; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[46:47] -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v44, 13 -; GFX9-NEXT: v_readlane_b32 s5, v44, 14 -; GFX9-NEXT: v_readlane_b32 s47, v44, 12 -; GFX9-NEXT: v_readlane_b32 s46, v44, 11 -; GFX9-NEXT: v_readlane_b32 s44, v44, 10 -; GFX9-NEXT: v_readlane_b32 s43, v44, 9 -; GFX9-NEXT: v_readlane_b32 s42, v44, 8 -; GFX9-NEXT: v_readlane_b32 s41, v44, 7 -; GFX9-NEXT: v_readlane_b32 s40, v44, 6 -; GFX9-NEXT: v_readlane_b32 s39, v44, 5 -; GFX9-NEXT: v_readlane_b32 s38, v44, 4 -; GFX9-NEXT: v_readlane_b32 s37, v44, 3 -; GFX9-NEXT: v_readlane_b32 s36, v44, 2 -; GFX9-NEXT: v_readlane_b32 s35, v44, 1 -; GFX9-NEXT: v_readlane_b32 s34, v44, 0 -; GFX9-NEXT: s_sub_u32 s32, s32, 0x800 -; GFX9-NEXT: v_readlane_b32 s33, v44, 15 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v43, s33, 4 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x800 +; GFX9-NEXT: v_writelane_b32 v43, s34, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, 
foo@gotpcrel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v43, s35, 1 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, v1 +; GFX9-NEXT: v_mov_b32_e32 v41, v0 +; GFX9-NEXT: v_writelane_b32 v43, s30, 2 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 +; GFX9-NEXT: v_writelane_b32 v43, s31, 3 +; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42 +; GFX9-NEXT: v_mov_b32_e32 v0, v40 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_add_u32_e32 v0, v40, v42 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s4, v43, 2 +; GFX9-NEXT: v_readlane_b32 s5, v43, 3 +; GFX9-NEXT: v_readlane_b32 s35, v43, 1 +; GFX9-NEXT: v_readlane_b32 s34, v43, 0 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x800 +; GFX9-NEXT: v_readlane_b32 s33, v43, 4 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] %b = and i32 %b.arg, 16777215 %s = and i32 %s.arg, 16777215 diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll @@ -27,23 +27,23 @@ ; CHECK-LABEL: 
csr_vgpr_spill_fp_callee: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s15, s33 +; CHECK-NEXT: s_mov_b32 s8, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_u32 s32, s32, 0x400 -; CHECK-NEXT: s_getpc_b64 s[18:19] -; CHECK-NEXT: s_add_u32 s18, s18, callee_has_fp@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s19, s19, callee_has_fp@rel32@hi+12 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 s[16:17], s[30:31] -; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: s_mov_b64 s[6:7], s[30:31] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; clobber csr v40 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_sub_u32 s32, s32, 0x400 -; CHECK-NEXT: s_mov_b32 s33, s15 +; CHECK-NEXT: s_mov_b32 s33, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[16:17] +; CHECK-NEXT: s_setpc_b64 s[6:7] bb: call fastcc void @callee_has_fp() call void asm sideeffect "; clobber csr v40", "~{v40}"() @@ -53,15 +53,15 @@ define amdgpu_kernel void @kernel_call() { ; CHECK-LABEL: kernel_call: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s17 -; CHECK-DAG: s_addc_u32 s1, s1, 0 -; CHECK-DAG: s_getpc_b64 s[18:19] -; CHECK-DAG: s_add_u32 s18, s18, csr_vgpr_spill_fp_callee@rel32@lo+4 -; CHECK-DAG: s_addc_u32 s19, s19, csr_vgpr_spill_fp_callee@rel32@hi+12 -; CHECK-DAG: s_mov_b32 s32, 0 -; CHECK-DAG: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s7 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: 
s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, csr_vgpr_spill_fp_callee@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, csr_vgpr_spill_fp_callee@rel32@hi+12 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm bb: tail call fastcc void @csr_vgpr_spill_fp_callee() @@ -73,23 +73,23 @@ ; CHECK-LABEL: csr_vgpr_spill_fp_tailcall_callee: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1 +; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; clobber csr v40 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: v_writelane_b32 v1, s33, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, callee_has_fp@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, callee_has_fp@rel32@hi+12 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12 ; CHECK-NEXT: v_readlane_b32 s33, v1, 0 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: s_setpc_b64 s[16:17] +; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_setpc_b64 s[4:5] bb: call void asm sideeffect "; clobber csr v40", "~{v40}"() tail call fastcc void @callee_has_fp() @@ -99,15 +99,15 @@ define amdgpu_kernel void @kernel_tailcall() { ; CHECK-LABEL: kernel_tailcall: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; 
CHECK-NEXT: s_add_u32 s0, s0, s17 -; CHECK-DAG: s_addc_u32 s1, s1, 0 -; CHECK-DAG: s_getpc_b64 s[18:19] -; CHECK-NEXT: s_add_u32 s18, s18, csr_vgpr_spill_fp_tailcall_callee@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s19, s19, csr_vgpr_spill_fp_tailcall_callee@rel32@hi+12 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s7 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, csr_vgpr_spill_fp_tailcall_callee@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, csr_vgpr_spill_fp_tailcall_callee@rel32@hi+12 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm bb: tail call fastcc void @csr_vgpr_spill_fp_tailcall_callee() diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -238,7 +238,7 @@ ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31 +; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v5 ; MUBUF-NEXT: s_mov_b32 s32, s6 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 @@ -275,7 +275,7 @@ ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2 ; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s4 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off -; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31 +; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v5 ; FLATSCR-NEXT: s_mov_b32 s32, s4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 @@ -331,13 +331,13 @@ ; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 ; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0 -; MUBUF-NEXT: v_mov_b32_e32 v4, s6 -; MUBUF-NEXT: buffer_store_dword v2, 
v4, s[0:3], 0 offen +; MUBUF-NEXT: v_mov_b32_e32 v5, s6 +; MUBUF-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen ; MUBUF-NEXT: v_mov_b32_e32 v2, 1 -; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31 +; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v4 ; MUBUF-NEXT: s_mov_b32 s32, s6 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 @@ -364,12 +364,12 @@ ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 ; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000 -; FLATSCR-NEXT: v_mov_b32_e32 v4, 0 -; FLATSCR-NEXT: v_mov_b32_e32 v5, 1 +; FLATSCR-NEXT: v_mov_b32_e32 v5, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v6, 1 ; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s2 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[5:6], s2 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off -; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31 +; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v4 ; FLATSCR-NEXT: s_mov_b32 s32, s2 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll --- a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll @@ -10,7 +10,7 @@ ; GCN: v_writelane_b32 v255, s33, 2 ; GCN: v_writelane_b32 v255, s30, 0 ; GCN: v_writelane_b32 v255, s31, 1 -; GCN: s_swappc_b64 s[30:31], s[16:17] +; GCN: s_swappc_b64 s[30:31], s[4:5] ; GCN: v_readlane_b32 s30, v255, 0 ; GCN: v_readlane_b32 s31, v255, 1 ; GCN: v_readlane_b32 s33, v255, 2 @@ -56,7 +56,7 @@ ; GCN: v_writelane_b32 v254, s33, 2 ; GCN: v_writelane_b32 v254, s30, 0 ; GCN: v_writelane_b32 v254, s31, 1 -; GCN: s_swappc_b64 s[30:31], s[16:17] +; GCN: 
s_swappc_b64 s[30:31], s[4:5] ; GCN: v_readlane_b32 s30, v254, 0 ; GCN: v_readlane_b32 s31, v254, 1 ; GCN: v_readlane_b32 s33, v254, 2 @@ -150,7 +150,7 @@ ; GCN-LABEL: {{^}}reserve_vgpr_with_tail_call ; GCN-NOT: buffer_store_dword v255, off, s[0:3], s32 ; GCN-NOT: v_writelane -; GCN: s_setpc_b64 s[16:17] +; GCN: s_setpc_b64 s[4:5] define void @reserve_vgpr_with_tail_call() #0 { %alloca = alloca i32, align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -128,12 +128,12 @@ ; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:8 +; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}} ; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4 ; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 -; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]] ; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]] +; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]] ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 @@ -155,9 +155,12 @@ ; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}} ; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4 +; GCN-NOT: s32 + ; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}} ; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4 +; GCN-NOT: s32 ; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: @@ -167,7 +170,7 @@ ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:32 +; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:28 ; GCN: s_setpc_b64 define fastcc i32 
@sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: @@ -194,14 +197,15 @@ ; Have another non-tail in the function ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call: ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec ; GCN: s_mov_b32 s33, s32 -; GCN-DAG: s_add_u32 s32, s32, 0x800 +; GCN-DAG: s_add_u32 s32, s32, 0x400 -; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-DAG: v_writelane_b32 v43, s46, 12 +; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-DAG: v_writelane_b32 v42, s34, 0 +; GCN-DAG: v_writelane_b32 v42, s35, 1 ; GCN-DAG: s_getpc_b64 s[4:5] ; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 @@ -210,22 +214,22 @@ ; GCN: s_swappc_b64 -; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 +; GCN: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 -; GCN-DAG: v_readlane_b32 s35, v43, 1 -; GCN-DAG: v_readlane_b32 s34, v43, 0 +; GCN-DAG: v_readlane_b32 s34, v42, 0 +; GCN-DAG: v_readlane_b32 s35, v42, 1 
-; GCN: s_sub_u32 s32, s32, 0x800 +; GCN: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: v_readlane_b32 s33, -; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: s_setpc_b64 s[16:17] +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { entry: %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) @@ -243,7 +247,7 @@ ; GCN-NOT: s33 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset: -; GCN: s_setpc_b64 s[16:17] +; GCN: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: %alloca = alloca [16 x i32], align 4, addrspace(5) @@ -255,10 +259,10 @@ ; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: ; GCN-NOT: s33 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:48 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:44 ; GCN-NOT: s33 -; GCN: s_setpc_b64 s[16:17] +; GCN: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 { entry: %alloca = alloca [16 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -0,0 +1,18 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: define internal void @indirect() #0 { +define internal void @indirect() { + ret void +} + +; GCN-LABEL: define amdgpu_kernel void @test_simple_indirect_call() 
#1 { +define amdgpu_kernel void @test_simple_indirect_call() { + %fptr = alloca void()* + store void()* @indirect, void()** %fptr + %fp = load void()*, void()** %fptr + call void %fp() + ret void +} + +; attributes #0 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } +; attributes #1 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-stack-objects" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -1,16 +1,16 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=7 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=1 < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}spill_csr_s5_copy: ; GCN: s_or_saveexec_b64 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 v40, s33, 5 +; GCN: v_writelane_b32 v40, s33, 2 ; GCN: s_swappc_b64 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 ; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; GCN: v_readlane_b32 s33, v40, 5 +; GCN: v_readlane_b32 s33, v40, 2 ; GCN: s_or_saveexec_b64 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN: s_mov_b64 exec diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ 
-157,21 +157,19 @@ ; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill: ; GCN: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2 ; GCN-DAG: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0 -; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 -; GCN: v_mov_b32_e32 v32, 0 ; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3 +; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 ; GCN: s_mov_b32 s34, s32 +; GCN: v_mov_b32_e32 v32, 0 ; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 -; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 offset:4 ; GCN-DAG: s_add_u32 s32, s32, 0x30000 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN: s_sub_u32 s32, s32, 0x30000 ; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG]], 2 diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -5,61 +5,61 @@ define hidden void @widget() { ; GCN-LABEL: widget: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v40, s33, 2 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: flat_load_dword 
v0, v[0:1] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 21, v0 -; GCN-NEXT: s_and_b64 vcc, exec, vcc -; GCN-NEXT: s_cbranch_vccz BB0_3 -; GCN-NEXT: ; %bb.1: ; %bb4 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0 -; GCN-NEXT: s_and_b64 vcc, exec, vcc -; GCN-NEXT: s_cbranch_vccnz BB0_4 -; GCN-NEXT: ; %bb.2: ; %bb7 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: s_branch BB0_7 -; GCN-NEXT: BB0_3: ; %bb2 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 21, v0 -; GCN-NEXT: s_and_b64 vcc, exec, vcc -; GCN-NEXT: s_cbranch_vccnz BB0_6 -; GCN-NEXT: BB0_4: ; %bb9 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execnz BB0_7 -; GCN-NEXT: ; %bb.5: ; %bb9.bb12_crit_edge -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: BB0_6: ; %bb12 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: flat_store_dword v[0:1], v2 -; GCN-NEXT: BB0_7: ; %UnifiedReturnBlock -; GCN-NEXT: v_readlane_b32 s4, v40, 0 -; GCN-NEXT: v_readlane_b32 s5, v40, 1 -; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s33, v40, 2 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_add_u32 s32, s32, 
0x400 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: flat_load_dword v0, v[0:1] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 21, v0 +; GCN-NEXT: s_and_b64 vcc, exec, vcc +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_cbranch_vccz BB0_3 +; GCN-NEXT: ; %bb.1: ; %bb4 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0 +; GCN-NEXT: s_and_b64 vcc, exec, vcc +; GCN-NEXT: s_cbranch_vccnz BB0_4 +; GCN-NEXT: ; %bb.2: ; %bb7 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, wibble@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, wibble@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_branch BB0_7 +; GCN-NEXT: BB0_3: ; %bb2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 21, v0 +; GCN-NEXT: s_and_b64 vcc, exec, vcc +; GCN-NEXT: s_cbranch_vccnz BB0_6 +; GCN-NEXT: BB0_4: ; %bb9 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, wibble@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, wibble@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execnz BB0_7 +; GCN-NEXT: ; %bb.5: ; %bb9.bb12_crit_edge +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: BB0_6: ; %bb12 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: BB0_7: ; %UnifiedReturnBlock +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 +; GCN-NEXT: s_sub_u32 s32, s32, 0x400 +; GCN-NEXT: v_readlane_b32 s33, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] ; SI-OPT-LABEL: @widget( ; SI-OPT-NEXT: bb: ; SI-OPT-NEXT: [[TMP:%.*]] = load i32, i32 addrspace(1)* null, align 16 @@ -186,124 +186,95 @@ ; 
GCN-LABEL: blam: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v44, s33, 15 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 v43, s33, 4 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v44, s34, 0 -; GCN-NEXT: v_writelane_b32 v44, s35, 1 -; GCN-NEXT: v_writelane_b32 v44, s36, 2 -; GCN-NEXT: v_writelane_b32 v44, s38, 3 -; GCN-NEXT: v_writelane_b32 v44, s39, 4 -; GCN-NEXT: v_writelane_b32 v44, s40, 5 -; GCN-NEXT: v_writelane_b32 v44, s41, 6 -; GCN-NEXT: v_writelane_b32 v44, s42, 7 -; GCN-NEXT: v_writelane_b32 v44, s43, 8 -; GCN-NEXT: v_writelane_b32 v44, s44, 9 -; GCN-NEXT: v_writelane_b32 v44, s45, 10 -; GCN-NEXT: v_writelane_b32 v44, s46, 11 -; GCN-NEXT: v_writelane_b32 v44, s47, 12 -; GCN-NEXT: v_writelane_b32 v44, s48, 13 -; GCN-NEXT: v_writelane_b32 v44, s49, 14 -; GCN-NEXT: v_mov_b32_e32 v40, v31 -; GCN-NEXT: s_mov_b32 s34, s14 -; GCN-NEXT: s_mov_b32 s35, s13 -; GCN-NEXT: s_mov_b32 s36, s12 -; GCN-NEXT: s_mov_b64 s[38:39], s[10:11] -; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] -; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] -; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; 
GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v43, s34, 0 +; GCN-NEXT: v_writelane_b32 v43, s35, 1 +; GCN-NEXT: v_writelane_b32 v43, s36, 2 +; GCN-NEXT: v_writelane_b32 v43, s37, 3 ; GCN-NEXT: s_mov_b64 s[4:5], 0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v40 -; GCN-NEXT: flat_load_dword v41, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v43, 0 -; GCN-NEXT: s_getpc_b64 s[48:49] -; GCN-NEXT: s_add_u32 s48, s48, spam@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s49, s49, spam@rel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v2 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: flat_load_dword v40, v[1:2] +; GCN-NEXT: v_mov_b32_e32 v42, 0 +; GCN-NEXT: s_getpc_b64 s[36:37] +; GCN-NEXT: s_add_u32 s36, s36, spam@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s37, s37, spam@rel32@hi+12 +; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[46:47], 0, v41 +; GCN-NEXT: v_cmp_eq_f32_e64 s[34:35], 0, v40 ; GCN-NEXT: s_branch BB1_3 -; GCN-NEXT: BB1_1: ; %bb10 -; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 +; GCN-NEXT: BB1_1: ; %bb10 +; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: BB1_2: ; %bb18 -; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 +; GCN-NEXT: BB1_2: ; %bb18 +; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000 ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: BB1_3: ; %bb2 -; GCN-NEXT: ; =>This Loop Header: Depth=1 -; GCN-NEXT: ; Child Loop BB1_4 Depth 2 +; GCN-NEXT: BB1_3: ; %bb2 +; GCN-NEXT: ; =>This Loop Header: Depth=1 +; GCN-NEXT: ; Child Loop BB1_4 Depth 2 ; GCN-NEXT: s_mov_b64 s[6:7], 0 -; GCN-NEXT: BB1_4: ; %bb2 -; GCN-NEXT: ; Parent Loop BB1_3 Depth=1 -; GCN-NEXT: ; => 
This Inner Loop Header: Depth=2 -; GCN-NEXT: flat_load_dword v0, v[42:43] +; GCN-NEXT: BB1_4: ; %bb2 +; GCN-NEXT: ; Parent Loop BB1_3 Depth=1 +; GCN-NEXT: ; => This Inner Loop Header: Depth=2 +; GCN-NEXT: flat_load_dword v0, v[41:42] ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 3, v0 ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GCN-NEXT: s_cbranch_execz BB1_6 -; GCN-NEXT: %bb.5: ; %bb8 -; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2 +; GCN-NEXT: ; %bb.5: ; %bb8 +; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN-NEXT: s_cbranch_execnz BB1_4 ; GCN-NEXT: s_branch BB1_1 -; GCN-NEXT: BB1_6: ; %bb6 -; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2 +; GCN-NEXT: BB1_6: ; %bb6 +; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-NEXT: s_cbranch_execnz BB1_4 -; GCN-NEXT: %bb.7: ; %bb11 -; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2 -; GCN-NEXT: _or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] -; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] -; GCN-NEXT: s_mov_b64 s[8:9], s[40:41] -; GCN-NEXT: s_mov_b64 s[10:11], s[38:39] -; GCN-NEXT: s_mov_b32 s12, s36 -; GCN-NEXT: s_mov_b32 s13, s35 -; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 -; GCN-NEXT: s_swappc_b64 s[30:31], s[48:49] -; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 -; GCN-NEXT: s_mov_b64 s[4:5], 0 -; GCN-NEXT: s_mov_b64 s[6:7], 0 -; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-NEXT: s_cbranch_execnz BB1_4 -; GCN-NEXT: ; %bb.8: ; %bb14 -; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-NEXT: s_and_saveexec_b64 s[4:5], 
s[46:47] -; GCN-NEXT: s_cbranch_execnz BB1_10 -; GCN-NEXT: ; %bb.9: ; %bb16 -; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: BB1_10: ; %bb17 -; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], 0 -; GCN-NEXT: s_branch BB1_2 - +; GCN-NEXT: ; %bb.7: ; %bb11 +; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN-NEXT: s_cbranch_execnz BB1_4 +; GCN-NEXT: ; %bb.8: ; %bb14 +; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 +; GCN-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[34:35] +; GCN-NEXT: s_cbranch_execnz BB1_10 +; GCN-NEXT: ; %bb.9: ; %bb16 +; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: BB1_10: ; %bb17 +; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], 0 +; GCN-NEXT: s_branch BB1_2 bb: %tmp = load float, float* null, align 16 br label %bb2 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -21,14 +21,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v32, v12 ; GFX9: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND -; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[16:23], s[4:7] dmask:0x1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 
+; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[4:7] dmask:0x1 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v44, s30, 0 ; GFX9: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload @@ -53,14 +53,14 @@ ; GFX10: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[16:23], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_getpc_b64 s[16:17] -; GFX10-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 -; GFX10: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 +; GFX10: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10: buffer_load_dword v43, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 @@ -100,14 +100,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v40, v12 ; GFX9: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[4:7] dmask:0x1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX9-NEXT: 
s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[4:7] dmask:0x1 ; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload @@ -127,29 +127,22 @@ ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10: s_getpc_b64 s[16:17] -; GFX10-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 -; GFX10-NEXT: s_mov_b32 s37, s36 -; GFX10-NEXT: s_mov_b32 s38, s36 -; GFX10-NEXT: s_mov_b32 s39, s36 -; GFX10-NEXT: s_mov_b32 s40, s36 -; GFX10-NEXT: s_mov_b32 s41, s36 -; GFX10-NEXT: s_mov_b32 s42, s36 -; GFX10-NEXT: s_mov_b32 s43, s36 -; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: v_writelane_b32 v45, s30, 8 + +; GFX10: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v40, v16 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v41, v15 ; GFX10-NEXT: v_mov_b32_e32 v42, v14 ; GFX10-NEXT: v_mov_b32_e32 v43, v13 -; GFX10-NEXT: v_writelane_b32 v45, s31, 9 ; GFX10-NEXT: v_mov_b32_e32 v44, v12 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: 
s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10: buffer_load_dword v44, off, s[0:3], s33 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir @@ -20,18 +20,10 @@ # FULL-NEXT: stackPtrOffsetReg: '$sgpr13' # FULL-NEXT: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } # FULL-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -# FULL-NEXT: workGroupIDX: { reg: '$sgpr6' } -# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' } -# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' } +# FULL-NEXT: workGroupIDX: { reg: '$sgpr6' } # FULL-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' } -# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } -# FULL-NEXT: workItemIDX: { reg: '$vgpr0' } -# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } -# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } +# FULL-NEXT: workItemIDX: { reg: '$vgpr0' } # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -55,18 +47,10 @@ # SIMPLE-NEXT: stackPtrOffsetReg: '$sgpr13' # SIMPLE-NEXT: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } # SIMPLE-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr6' } -# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' } -# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' } +# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr6' } # 
SIMPLE-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' } -# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } -# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr0' } -# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } -# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } +# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr0' } # SIMPLE-NEXT: occupancy: 10 # SIMPLE-NEXT: body: name: kernel0 @@ -112,16 +96,6 @@ # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' # FULL-NEXT: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' } -# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' } -# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' } -# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } -# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } -# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } -# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -137,16 +111,6 @@ # SIMPLE-NEXT: maxKernArgAlign: 1 # SIMPLE-NEXT: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' } -# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' } -# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' } -# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } -# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } -# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } -# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # SIMPLE-NEXT: occupancy: 10 # SIMPLE-NEXT: body: @@ -175,16 +139,6 @@ # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' # FULL-NEXT: 
argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' } -# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' } -# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' } -# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } -# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } -# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } -# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -200,16 +154,6 @@ # SIMPLE-NEXT: maxKernArgAlign: 1 # SIMPLE-NEXT: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' } -# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' } -# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' } -# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } -# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } -# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } -# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # SIMPLE-NEXT: occupancy: 10 # SIMPLE-NEXT: body: @@ -239,16 +183,6 @@ # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' # FULL-NEXT: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' } -# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' } -# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' } -# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } -# FULL-NEXT: workItemIDX: { reg: '$vgpr31', 
mask: 1023 } -# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } -# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -265,16 +199,6 @@ # SIMPLE-NEXT: isEntryFunction: true # SIMPLE-NEXT: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' } -# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' } -# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' } -# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } -# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } -# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } -# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # SIMPLE-NEXT: occupancy: 10 # SIMPLE-NEXT: body: @@ -311,31 +235,13 @@ # FULL: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } # FULL-NEXT: flatScratchInit: { offset: 4 } -# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' } -# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' } -# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' } -# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } -# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } -# FULL-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 } -# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } +# FULL-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 } # SIMPLE: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } # SIMPLE-NEXT: flatScratchInit: 
{ offset: 4 } -# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' } -# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' } -# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' } -# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } -# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } -# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 } -# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } +# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 } name: fake_stack_arginfo machineFunctionInfo: argumentInfo: