Index: llvm/lib/Target/AMDGPU/AMDGPU.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPU.h +++ llvm/lib/Target/AMDGPU/AMDGPU.h @@ -56,6 +56,7 @@ ModulePass *createAMDGPULowerModuleLDSPass(); FunctionPass *createSIModeRegisterPass(); FunctionPass *createGCNPreRAOptimizationsPass(); +FunctionPass *createGCNPreRaFixupsPass(); struct AMDGPUSimplifyLibCallsPass : PassInfoMixin { AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {} @@ -334,6 +335,9 @@ void initializeGCNPreRAOptimizationsPass(PassRegistry &); extern char &GCNPreRAOptimizationsID; +void initializeGCNPreRaFixupsPass(PassRegistry &); +extern char &GCNPreRaFixupsID; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -378,6 +378,7 @@ initializeAMDGPUResourceUsageAnalysisPass(*PR); initializeGCNNSAReassignPass(*PR); initializeGCNPreRAOptimizationsPass(*PR); + initializeGCNPreRaFixupsPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -1223,6 +1224,7 @@ insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID); insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID); + insertPass(&TwoAddressInstructionPassID, &GCNPreRaFixupsID); TargetPassConfig::addFastRegAlloc(); } @@ -1239,6 +1241,8 @@ if (isPassEnabled(EnablePreRAOptimizations)) insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID); + insertPass(&ProcessImplicitDefsID, &GCNPreRaFixupsID); + // This is not an essential optimization and it has a noticeable impact on // compilation time, so we only enable it from O2. 
if (TM->getOptLevel() > CodeGenOpt::Less) Index: llvm/lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- llvm/lib/Target/AMDGPU/CMakeLists.txt +++ llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -106,6 +106,7 @@ GCNPreRAOptimizations.cpp GCNRegPressure.cpp GCNSchedStrategy.cpp + GCNPreRaFixups.cpp R600AsmPrinter.cpp R600ClauseMergePass.cpp R600ControlFlowFinalizer.cpp Index: llvm/lib/Target/AMDGPU/GCNPreRaFixups.cpp =================================================================== --- /dev/null +++ llvm/lib/Target/AMDGPU/GCNPreRaFixups.cpp @@ -0,0 +1,97 @@ +//===--------------------- GCNPreRaFixups.cpp ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" + +using namespace llvm; + +#define DEBUG_TYPE "gcn-pre-ra-fixups" + +namespace { + +class GCNPreRaFixups : public MachineFunctionPass { +private: + const SIRegisterInfo *TRI = nullptr; + const SIInstrInfo *TII = nullptr; + MachineRegisterInfo *MRI = nullptr; + + bool handleMAI(MachineInstr &MI) const; + +public: + static char ID; + + GCNPreRaFixups() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "GCN Pre-RA Fixups"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // end anonymous namespace + +char GCNPreRaFixups::ID = 0; + +INITIALIZE_PASS(GCNPreRaFixups, DEBUG_TYPE, "GCN Pre-RA Fixups", false, false) + +char &llvm::GCNPreRaFixupsID = GCNPreRaFixups::ID; + +FunctionPass *llvm::createGCNPreRaFixupsPass() { + 
return new GCNPreRaFixups(); +} + +bool GCNPreRaFixups::handleMAI(MachineInstr &MI) const { + if (!TII->requiresNoAliasingBetweenDstAndSrcC(MI) || + MI.getOperand(0).isTied()) + return false; + + // SrcC shall not partially overlap with Dst. Either tie it to Dst if + // SrcC has a single use and we can overwrite it or set earlyclobber + // on the Dst to allocate a disjoint register otherwise. + auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); + Register Reg = Src2->getReg(); + if (Reg.isVirtual() && MRI->hasOneUse(Reg)) { + MachineInstrBuilder(*MI.getMF(), &MI) + .addReg(Reg, RegState::Implicit, Src2->getSubReg()); + MI.tieOperands(0, MI.getNumOperands() - 1); + return true; + } + + MI.getOperand(0).setIsEarlyClobber(); + return true; +} + +bool GCNPreRaFixups::runOnMachineFunction(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget(); + + if (!ST.hasMAIInsts()) + return false; + + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); + + bool Changed = false; + + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (SIInstrInfo::isMAI(MI)) + Changed |= handleMAI(MI); + } + } + + return Changed; +} Index: llvm/lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1132,6 +1132,11 @@ static unsigned getDSShaderTypeValue(const MachineFunction &MF); const TargetSchedModel &getSchedModel() const { return SchedModel; } + + /// An MFMA instruction reading more than 4 registers as SrcC shall have + /// SrcC either completely disjoint from the Dst or have SrcC exactly the + /// same as Dst. Partial overlap is not supported. 
+ bool requiresNoAliasingBetweenDstAndSrcC(const MachineInstr &MI) const; }; /// \brief Returns true if a reg:subreg pair P has a TRC class Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -8268,3 +8268,17 @@ return false; } + +bool +SIInstrInfo::requiresNoAliasingBetweenDstAndSrcC(const MachineInstr &MI) const { + if (!isMAI(MI)) + return false; + const MachineOperand *SrcC = getNamedOperand(MI, AMDGPU::OpName::src2); + if (!SrcC || !SrcC->isReg()) + return false; + Register Reg = SrcC->getReg(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + const TargetRegisterClass *RC = TRI->getRegClassForReg(MRI, Reg); + return RC->MC->getSizeInBits() > 128; +} Index: llvm/test/CodeGen/AMDGPU/acc-ldst.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/acc-ldst.ll +++ llvm/test/CodeGen/AMDGPU/acc-ldst.ll @@ -168,10 +168,13 @@ ret void } +; FIXME: there are v_accvgpr_read_b32 instructions in between the two mfma +; since the result is marked earlyclobber.
; GCN-LABEL: {{^}}test_multiuse_load_mfma_mfma_store: ; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}] ; GCN-NOT: v_accvgpr_write ; GCN: v_mfma_f32_32x32x1f32 +; GCN: v_mfma_f32_32x32x1f32 ; GCN-NOT: v_accvgpr_read ; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}] define amdgpu_kernel void @test_multiuse_load_mfma_mfma_store(<32 x float> addrspace(1)* %arg) { Index: llvm/test/CodeGen/AMDGPU/llc-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -113,6 +113,7 @@ ; GCN-O0-NEXT: Virtual Register Map ; GCN-O0-NEXT: Live Register Matrix ; GCN-O0-NEXT: SI Pre-allocate WWM Registers +; GCN-O0-NEXT: GCN Pre-RA Fixups ; GCN-O0-NEXT: Fast Register Allocator ; GCN-O0-NEXT: SI lower SGPR spill instructions ; GCN-O0-NEXT: Fast Register Allocator @@ -308,6 +309,7 @@ ; GCN-O1-NEXT: Detect Dead Lanes ; GCN-O1-NEXT: Remove dead machine instructions ; GCN-O1-NEXT: Process Implicit Definitions +; GCN-O1-NEXT: GCN Pre-RA Fixups ; GCN-O1-NEXT: Remove unreachable machine basic blocks ; GCN-O1-NEXT: Live Variable Analysis ; GCN-O1-NEXT: MachineDominator Tree Construction @@ -593,6 +595,7 @@ ; GCN-O1-OPTS-NEXT: Detect Dead Lanes ; GCN-O1-OPTS-NEXT: Remove dead machine instructions ; GCN-O1-OPTS-NEXT: Process Implicit Definitions +; GCN-O1-OPTS-NEXT: GCN Pre-RA Fixups ; GCN-O1-OPTS-NEXT: Remove unreachable machine basic blocks ; GCN-O1-OPTS-NEXT: Live Variable Analysis ; GCN-O1-OPTS-NEXT: SI Optimize VGPR LiveRange @@ -879,6 +882,7 @@ ; GCN-O2-NEXT: Detect Dead Lanes ; GCN-O2-NEXT: Remove dead machine instructions ; GCN-O2-NEXT: Process Implicit Definitions +; GCN-O2-NEXT: GCN Pre-RA Fixups ; GCN-O2-NEXT: Remove unreachable machine basic blocks ; GCN-O2-NEXT: Live Variable Analysis ; GCN-O2-NEXT: SI Optimize VGPR LiveRange @@ -1179,6 +1183,7 @@ ; GCN-O3-NEXT: Detect Dead Lanes ; GCN-O3-NEXT: Remove dead machine 
instructions ; GCN-O3-NEXT: Process Implicit Definitions +; GCN-O3-NEXT: GCN Pre-RA Fixups ; GCN-O3-NEXT: Remove unreachable machine basic blocks ; GCN-O3-NEXT: Live Variable Analysis ; GCN-O3-NEXT: SI Optimize VGPR LiveRange Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -602,14 +602,15 @@ ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vecarg: ; GFX90A-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GFX90A-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 ; GCN-COUNT-8: global_load_dwordx4 ; GFX908-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 ; GFX90A-NOT: v_accvgpr_write +; GFX908-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GFX908-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 ; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 ; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 ; GFX908-COUNT-8: global_store_dwordx4 ; GFX90A-NOT: v_accvgpr_read_b32 ; GFX90A-COUNT-5: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}] Index: llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -0,0 +1,66 @@ +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope 
--check-prefixes=GCN,GREEDY %s + RUN: llc -march=amdgcn -mcpu=gfx90a -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,FAST %s + +; Check that Dst and SrcC of MFMA instructions reading more than 4 registers as SrcC +; are either completely disjoint or exactly the same, but do not alias. + +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32: +; GREEDY: v_mfma_f32_32x32x1f32 a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] +; GREEDY: v_mfma_f32_32x32x1f32 a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] +; FAST: v_mfma_f32_32x32x1f32 a[64:95], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95] +; FAST: v_mfma_f32_32x32x1f32 a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95] +; GCN: v_mfma_f32_32x32x1f32 a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] +define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x float> addrspace(1)* %arg) #0 { +bb: + %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0) + %tmp.1 = shufflevector <32 x float> %mai.2, <32 x float> %mai.1, <32 x i32> + %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %tmp.1, i32 0, i32 0, i32 0) + store <32 x float> %mai.3, <32 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32: +; GREEDY: v_mfma_f32_16x16x1f32 a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] +; GREEDY: v_mfma_f32_16x16x1f32 a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] +; FAST:
v_mfma_f32_16x16x1f32 a[32:47], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47] +; FAST: v_mfma_f32_16x16x1f32 a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47] +; GCN: v_mfma_f32_16x16x1f32 a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] +define amdgpu_kernel void @test_mfma_f32_16x16x1f32(<16 x float> addrspace(1)* %arg) #0 { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %mai.1, i32 0, i32 0, i32 0) + %tmp.1 = shufflevector <16 x float> %mai.2, <16 x float> %mai.1, <16 x i32> + %mai.3 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %tmp.1, i32 0, i32 0, i32 0) + store <16 x float> %mai.3, <16 x float> addrspace(1)* %arg + ret void +} + +; This instruction allows the overlap since it only reads 4 registers. + +; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32: +; GREEDY: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] +; GREEDY: v_mfma_f32_4x4x1f32 a[2:5], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] +; FAST: v_mfma_f32_4x4x1f32 a[8:11], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] +; FAST: v_mfma_f32_4x4x1f32 a[4:7], v{{[0-9]+}}, v{{[0-9]+}}, a[8:11] +; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] +define amdgpu_kernel void @test_mfma_f32_4x4x1f32(<4 x float> addrspace(1)* %arg) #0 { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %mai.1, i32 0, i32 0, i32 0) + %tmp.1 = shufflevector <4 x float> %mai.1, <4 x float> %mai.2, <4 x i32> + %mai.3 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %tmp.1, i32 0, i32 0, i32 0) + 
store <4 x float> %mai.3, <4 x float> addrspace(1)* %arg + ret void +} + +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } Index: llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll +++ llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll @@ -41,4 +41,4 @@ ret void } -attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" } +attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="1,1" } Index: llvm/test/CodeGen/AMDGPU/spill-agpr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -1,37 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s -; GCN-LABEL: {{^}}max_24regs_32a_used: -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GCN-DAG: v_mfma_f32_16x16x1f32 -; GCN-DAG: v_mfma_f32_16x16x1f32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-NOT: buffer_store_dword -; GCN-NOT: buffer_load_dword -; GFX908-NOT: v_accvgpr_write_b32 -; GFX90A: v_accvgpr_write_b32 -; GCN: ScratchSize: 0 -define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 { -bb: - %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg - %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0) - %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %mai.1, i32 0, i32 0, i32 0) - %elt1 = extractelement <16 x float> %mai.2, i32 0 - %elt2 = extractelement <16 x float> %mai.1, i32 15 - %elt3 = extractelement <16 x float> 
%mai.1, i32 14 - %elt4 = extractelement <16 x float> %mai.2, i32 1 - store float %elt1, float addrspace(1)* %out - %gep1 = getelementptr float, float addrspace(1)* %out, i64 1 - store float %elt2, float addrspace(1)* %gep1 - %gep2 = getelementptr float, float addrspace(1)* %out, i64 2 - store float %elt3, float addrspace(1)* %gep2 - %gep3 = getelementptr float, float addrspace(1)* %out, i64 3 - store float %elt4, float addrspace(1)* %gep3 - - ret void -} - ; GCN-LABEL: {{^}}max_12regs_13a_used: ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 @@ -152,7 +121,6 @@ declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) -attributes #0 = { nounwind "amdgpu-num-vgpr"="24" } attributes #1 = { nounwind "amdgpu-num-vgpr"="10" } attributes #2 = { nounwind "amdgpu-num-vgpr"="12" } attributes #3 = { nounwind "amdgpu-num-vgpr"="32" }