diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -14,6 +14,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -441,6 +442,42 @@
   //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
 }
 
+// Find a def of the UseReg, check if it is a reg_sequence and find the
+// initializer for each subreg, tracking it to a foldable inline immediate
+// if possible. Returns true on success.
+static bool getRegSeqInit(
+    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
+    Register UseReg, uint8_t OpTy,
+    const SIInstrInfo *TII, const MachineRegisterInfo &MRI) {
+  MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
+  if (!Def || !Def->isRegSequence())
+    return false;
+
+  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
+    MachineOperand *Sub = &Def->getOperand(I);
+    assert(Sub->isReg());
+
+    for (MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub->getReg());
+         SubDef && Sub->isReg() && !Sub->getSubReg() &&
+         TII->isFoldableCopy(*SubDef);
+         SubDef = MRI.getUniqueVRegDef(Sub->getReg())) {
+      MachineOperand *Op = &SubDef->getOperand(1);
+      if (Op->isImm()) {
+        if (TII->isInlineConstant(*Op, OpTy))
+          Sub = Op;
+        break;
+      }
+      if (!Op->isReg())
+        break;
+      Sub = Op;
+    }
+
+    Defs.push_back(std::make_pair(Sub, Def->getOperand(I + 1).getImm()));
+  }
+
+  return true;
+}
+
 static bool tryToFoldACImm(const SIInstrInfo *TII,
                            const MachineOperand &OpToFold,
                            MachineInstr *UseMI,
@@ -474,39 +511,30 @@
     return false;
 
   MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
-  const MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
-  if (!Def || !Def->isRegSequence())
+  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
+  if (!getRegSeqInit(Defs, UseReg, OpTy, TII, MRI))
     return false;
 
-  int64_t Imm;
-  MachineOperand *Op;
-  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
-    const MachineOperand &Sub = Def->getOperand(I);
-    if (!Sub.isReg() || Sub.getSubReg())
+  int32_t Imm;
+  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
+    const MachineOperand *Op = Defs[I].first;
+    if (!Op->isImm())
       return false;
-    MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub.getReg());
-    while (SubDef && !SubDef->isMoveImmediate() &&
-           !SubDef->getOperand(1).isImm() && TII->isFoldableCopy(*SubDef))
-      SubDef = MRI.getUniqueVRegDef(SubDef->getOperand(1).getReg());
-    if (!SubDef || !SubDef->isMoveImmediate() || !SubDef->getOperand(1).isImm())
-      return false;
-    Op = &SubDef->getOperand(1);
+
     auto SubImm = Op->getImm();
-    if (I == 1) {
-      if (!TII->isInlineConstant(SubDef->getOperand(1), OpTy))
+    if (!I) {
+      Imm = SubImm;
+      if (!TII->isInlineConstant(*Op, OpTy) ||
+          !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
        return false;
-      Imm = SubImm;
 
      continue;
    }
    if (Imm != SubImm)
      return false; // Can only fold splat constants
  }
 
-  if (!TII->isOperandLegal(*UseMI, UseOpIdx, Op))
-    return false;
-
-  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Op);
+  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
   return true;
 }
 
@@ -645,11 +673,92 @@
     LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI << '\n');
     unsigned Size = TII->getOpSize(*UseMI, 1);
-    UseMI->getOperand(1).setReg(OpToFold.getReg());
+    Register UseReg = OpToFold.getReg();
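+    // Rewrite the copy to read the folded source register directly; kill
+    // flags are cleared because the register may now be used past its
+    // original kill point.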
+    UseMI->getOperand(1).setReg(UseReg);
     UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
     UseMI->getOperand(1).setIsKill(false);
     CopiesToReplace.push_back(UseMI);
     OpToFold.setIsKill(false);
+
+    // It is very tricky to store a value into an AGPR: v_accvgpr_write_b32
+    // can only accept a VGPR or an inline immediate. Recreate a reg_sequence
+    // with its initializers right here, so we will rematerialize immediates
+    // and avoid copies via different reg classes.
+    SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
+    if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+        getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32, TII,
+                      *MRI)) {
+      const DebugLoc &DL = UseMI->getDebugLoc();
+      MachineBasicBlock &MBB = *UseMI->getParent();
+
+      UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
+      for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
+        UseMI->RemoveOperand(I);
+
+      MachineInstrBuilder B(*MBB.getParent(), UseMI);
+      DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
+      SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
+      for (unsigned I = 0; I < Size / 4; ++I) {
+        MachineOperand *Def = Defs[I].first;
+        TargetInstrInfo::RegSubRegPair CopyToVGPR;
+        if (Def->isImm() &&
+            TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+          int64_t Imm = Def->getImm();
+
+          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+          BuildMI(MBB, UseMI, DL,
+                  TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), Tmp).addImm(Imm);
+          B.addReg(Tmp);
+        } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
+          auto Src = getRegSubRegPair(*Def);
+          Def->setIsKill(false);
+          if (!SeenAGPRs.insert(Src)) {
+            // We cannot build a reg_sequence out of the same registers; they
+            // must be copied. Better do it here, before copyPhysReg() creates
+            // several reads to do the AGPR->VGPR->AGPR copy.
+            CopyToVGPR = Src;
+          } else {
+            B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
+                     Src.SubReg);
+          }
+        } else {
+          assert(Def->isReg());
+          Def->setIsKill(false);
+          auto Src = getRegSubRegPair(*Def);
+
+          // A direct copy from an SGPR to an AGPR is not possible. To avoid
+          // copyPhysReg() later creating exploded SGPR->VGPR->AGPR copies,
+          // create a copy here and track if we already have such a copy.
+ if (TRI->isSGPRReg(*MRI, Src.Reg)) { + CopyToVGPR = Src; + } else { + auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass); + BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def); + B.addReg(Tmp); + } + } + + if (CopyToVGPR.Reg) { + Register Vgpr; + if (VGPRCopies.count(CopyToVGPR)) { + Vgpr = VGPRCopies[CopyToVGPR]; + } else { + Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def); + VGPRCopies[CopyToVGPR] = Vgpr; + } + auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass); + BuildMI(MBB, UseMI, DL, + TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), Tmp).addReg(Vgpr); + B.addReg(Tmp); + } + + B.addImm(Defs[I].second); + } + LLVM_DEBUG(dbgs() << "Folded " << *UseMI << '\n'); + return; + } + if (Size != 4) return; if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) && diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -1,53 +1,115 @@ -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; GCN-LABEL: {{^}}test_mfma_loop_zeroinit: +; GCN-COUNT32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} + +; Check that we do not copy agprs to vgprs and back inside the loop. + +; GCN: [[LOOP:BB[0-9_]+]]: +; GCN-NOT: v_accvgpr +; GCN: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr +; GCN: s_cbranch_scc1 [[LOOP]] + +; Final result should be read only once after the loop. + +; GCN-COUNT32: v_accvgpr_read_b32 + +define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) { +entry: + br label %for.cond.preheader + +for.cond.preheader: + %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ] + %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) + %inc = add nuw nsw i32 %c, 1 + %cc = icmp eq i32 %inc, 16 + br i1 %cc, label %exit, label %for.cond.preheader + +exit: + store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_splat: + ; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only. ; 3 vgprs are needed to avoid wait states between writes. +; Check that we do not use 32 temp sgprs as well. + +; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000 +; GCN-COUNT32: v_accvgpr_write_b32 a0, [[TMP]] + +; GCN: [[LOOP:BB[0-9_]+]]: +; GCN-NOT: v_accvgpr +; GCN: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr +; GCN: s_cbranch_scc1 [[LOOP]] + +; GCN-COUNT32: v_accvgpr_read_b32 + +define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(<32 x float> addrspace(1)* %arg) { +entry: + br label %for.cond.preheader -; FIXME: We should not be using and temporary registers at all. -; At the moment we initialize an sgpr, then copy it via vgprs. 
+for.cond.preheader: + %phi = phi <32 x float> [ , %entry ], [ %mai.1, %for.cond.preheader ] + %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) + %inc = add nuw nsw i32 %c, 1 + %cc = icmp eq i32 %inc, 16 + br i1 %cc, label %exit, label %for.cond.preheader -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2:v[0-9]+]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3:v[0-9]+]] +exit: + store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg + ret void +} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1:v[0-9]+]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; GCN-LABEL: {{^}}test_mfma_loop_non_splat: -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}} +; GCN-COUNT30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; GCN: [[LOOP:BB[0-9_]+]]: +; GCN-NOT: v_accvgpr +; GCN: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr +; GCN: s_cbranch_scc1 [[LOOP]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; GCN-COUNT32: v_accvgpr_read_b32 -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +define amdgpu_kernel void @test_mfma_loop_non_splat(<32 x float> addrspace(1)* %arg) { +entry: + br label %for.cond.preheader -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +for.cond.preheader: + %phi = phi <32 x float> [ , %entry ], [ %mai.1, %for.cond.preheader ] + %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) + %inc = add nuw nsw i32 %c, 1 + %cc = icmp eq i32 %inc, 16 + br i1 %cc, label %exit, label %for.cond.preheader -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +exit: + store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg + ret void +} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq: -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only. +; 3 vgprs are needed to avoid wait states between writes. -; Check that we do not copy agprs to vgprs and back inside the loop. 
+; GCN: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000 +; GCN: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000 +; GCN: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000 +; GCN-COUNT29: v_mov_b32_e32 v1, 0x4{{[0-9a-f]+}} +; GCN-COUNT10: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] +; GCN-COUNT11: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] +; GCN-COUNT11: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] ; GCN: [[LOOP:BB[0-9_]+]]: ; GCN-NOT: v_accvgpr @@ -55,16 +117,179 @@ ; GCN-NOT: v_accvgpr ; GCN: s_cbranch_scc1 [[LOOP]] -; Final result should be read only once after the loop. +; GCN-COUNT32: v_accvgpr_read_b32 + +define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(<32 x float> addrspace(1)* %arg) { +entry: + br label %for.cond.preheader + +for.cond.preheader: + %phi = phi <32 x float> [ , %entry ], [ %mai.1, %for.cond.preheader ] + %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) + %inc = add nuw nsw i32 %c, 1 + %cc = icmp eq i32 %inc, 16 + br i1 %cc, label %exit, label %for.cond.preheader + +exit: + store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_loop_vgpr_init: + +; GCN-COUNT32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}} + +; GCN: [[LOOP:BB[0-9_]+]]: +; GCN-NOT: v_accvgpr +; GCN: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr +; GCN: s_cbranch_scc1 [[LOOP]] ; GCN-COUNT32: v_accvgpr_read_b32 -define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) { +define amdgpu_kernel void @test_mfma_loop_vgpr_init(<32 x float> addrspace(1)* %arg) { entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %init = bitcast i32 %tid to float + %tmp0 = insertelement <32 x float> undef, float %init, i32 0 + %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1 + %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2 + %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3 + %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4 + %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5 + %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6 + %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7 + %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8 + %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9 + %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10 + %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11 + %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12 + %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13 + %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14 + %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15 + %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16 + %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17 + %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18 + %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19 + %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20 + %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21 + %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22 + %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23 + %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24 + %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25 + %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26 + %tmp27 = insertelement <32 x float> %tmp26, float 
%init, i32 27 + %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28 + %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29 + %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30 + %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31 + br label %for.cond.preheader for.cond.preheader: - %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ] + %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ] + %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) + %inc = add nuw nsw i32 %c, 1 + %cc = icmp eq i32 %inc, 16 + br i1 %cc, label %exit, label %for.cond.preheader + +exit: + store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init: + +; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}} +; GCN-COUNT32: v_accvgpr_write_b32 a0, [[TMP]] + +; GCN: [[LOOP:BB[0-9_]+]]: +; GCN-NOT: v_accvgpr +; GCN: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr +; GCN: s_cbranch_scc1 [[LOOP]] + +; GCN-COUNT32: v_accvgpr_read_b32 + +define amdgpu_kernel void @test_mfma_loop_sgpr_init(<32 x float> addrspace(1)* %arg, float %init) { +entry: + %tmp0 = insertelement <32 x float> undef, float %init, i32 0 + %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1 + %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2 + %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3 + %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4 + %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5 + %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6 + %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7 + %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8 + %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9 + %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10 + %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11 + %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12 + %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13 + %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14 + %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15 + %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16 + %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17 + %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18 + %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19 + %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20 + %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21 + %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22 + %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23 + %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24 + %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25 + %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26 + %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27 + %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28 + %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29 + %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30 + %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31 + + br label %for.cond.preheader + +for.cond.preheader: + %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ] + %c = phi i32 [ 0, %entry 
], [ %inc, %for.cond.preheader ] + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) + %inc = add nuw nsw i32 %c, 1 + %cc = icmp eq i32 %inc, 16 + br i1 %cc, label %exit, label %for.cond.preheader + +exit: + store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_loop_mixed_init: + +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v0 +; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] + +; GCN-COUNT30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} + +; GCN: [[LOOP:BB[0-9_]+]]: +; GCN-NOT: v_accvgpr +; GCN: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr +; GCN: s_cbranch_scc1 [[LOOP]] + +; GCN-COUNT32: v_accvgpr_read_b32 + +define amdgpu_kernel void @test_mfma_loop_mixed_init(<32 x float> addrspace(1)* %arg, float %x) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %init = bitcast i32 %tid to float + %tmp0 = insertelement <32 x float> zeroinitializer, float %init, i32 0 + %tmp1 = insertelement <32 x float> %tmp0, float %x, i32 1 + + br label %for.cond.preheader + +for.cond.preheader: + %phi = phi <32 x float> [ %tmp1, %entry ], [ %mai.1, %for.cond.preheader ] %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) %inc = add nuw nsw i32 %c, 1 @@ -91,8 +316,6 @@ ; GCN-COUNT32: v_accvgpr_read_b32 define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) { entry: - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %init = bitcast i32 %tid to float %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0) br label %for.cond.preheader @@ -110,5 +333,74 @@ ret void } +; GCN-LABEL: {{^}}test_mfma_loop_agpr_init: + +; GCN-COUNT32: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN: v_mfma_f32_32x32x1f32 + +; Check that we are using only one tmp VGPR. 
+ +; GCN: v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}} +; GCN-COUNT32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]{{$}} + +; GCN: [[LOOP:BB[0-9_]+]]: +; GCN-NOT: v_accvgpr +; GCN: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr +; GCN: s_cbranch_scc1 [[LOOP]] + +; GCN-COUNT32: v_accvgpr_read_b32 +define amdgpu_kernel void @test_mfma_loop_agpr_init(<32 x float> addrspace(1)* %arg) { +entry: + %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0) + %init = extractelement <32 x float> %mai.0, i32 0 + %tmp0 = insertelement <32 x float> undef, float %init, i32 0 + %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1 + %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2 + %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3 + %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4 + %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5 + %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6 + %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7 + %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8 + %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9 + %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10 + %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11 + %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12 + %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13 + %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14 + %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15 + %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16 + %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17 + %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18 + %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19 + %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20 + %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21 + %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22 + %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23 + %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24 + %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25 + %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26 + %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27 + %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28 + %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29 + %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30 + %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31 + + br label %for.cond.preheader + +for.cond.preheader: + %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ] + %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) + %inc = add nuw nsw i32 %c, 1 + %cc = icmp eq i32 %inc, 16 + br i1 %cc, label %exit, label %for.cond.preheader + +exit: + store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg + ret void +} + declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) declare i32 @llvm.amdgcn.workitem.id.x()