Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -48,7 +48,6 @@
 FunctionPass *createSIFixSGPRLiveRangesPass();
 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 FunctionPass *createSIInsertWaits(TargetMachine &tm);
-FunctionPass *createSIPrepareScratchRegs();
 
 ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1095,34 +1095,12 @@
   MachineFunction &MF = CurDAG->getMachineFunction();
   const SIRegisterInfo *TRI =
       static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  const SITargetLowering& Lowering =
-      *static_cast<const SITargetLowering *>(getTargetLowering());
-
-  unsigned ScratchOffsetReg =
-      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
-  Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass,
-                                ScratchOffsetReg, MVT::i32);
-  SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32);
-  SDValue ScratchRsrcDword0 =
-      SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0);
-
-  SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32);
-  SDValue ScratchRsrcDword1 =
-      SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0);
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
-  const SDValue RsrcOps[] = {
-      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
-      ScratchRsrcDword0,
-      CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
-      ScratchRsrcDword1,
-      CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
-  };
-  SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
-                                                      MVT::v2i32, RsrcOps), 0);
-  Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0);
-  SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
-      MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
+  unsigned ScratchOffsetReg
+    = TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
+  SOffset = CurDAG->getRegister(ScratchOffsetReg, MVT::i32);
 
   // (add n0, c1)
   if (CurDAG->isBaseWithConstantOffset(Addr)) {
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -327,7 +327,6 @@
 }
 
 void GCNPassConfig::addPostRegAlloc() {
-  addPass(createSIPrepareScratchRegs(), false);
   addPass(createSIShrinkInstructionsPass(), false);
 }
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -57,7 +57,6 @@
   SILowerControlFlow.cpp
   SILowerI1Copies.cpp
   SIMachineFunctionInfo.cpp
-  SIPrepareScratchRegs.cpp
   SIRegisterInfo.cpp
   SIShrinkInstructions.cpp
   SITypeRewriter.cpp
Index: lib/Target/AMDGPU/SIFrameLowering.h
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.h
+++ lib/Target/AMDGPU/SIFrameLowering.h
@@ -21,6 +21,9 @@
     AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
   ~SIFrameLowering() override {}
 
+  void emitPrologue(MachineFunction &MF,
+                    MachineBasicBlock &MBB) const override;
+
   void processFunctionBeforeFrameFinalized(
     MachineFunction &MF,
     RegScavenger *RS = nullptr) const override;
Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -8,17 +8,89 @@
 //==-----------------------------------------------------------------------===//
 
 #include "SIFrameLowering.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 
 using namespace llvm;
 
+static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
+                              const MachineFrameInfo *FrameInfo) {
+  if (!FuncInfo->hasSpilledSGPRs())
+    return false;
+
+  if (FuncInfo->hasSpilledVGPRs())
+    return false;
+
+  for (int I = FrameInfo->getObjectIndexBegin(),
+         E = FrameInfo->getObjectIndexEnd(); I != E; ++I) {
+    if (!FrameInfo->isSpillSlotObjectIndex(I))
+      return false;
+  }
+
+  return true;
+}
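The new emitPrologue below materializes the scratch resource descriptor by writing SCRATCH_RSRC_DWORD0/1 into the first two sub-registers and splitting SIInstrInfo::getScratchRsrcWords23() into the remaining two 32-bit immediates. A minimal standalone sketch of that split, assuming the CI value implied by the "s_mov_b32 s10, -1" and "s_mov_b32 s11, 0x80f000" checks in the tests further down (the actual constant is whatever getScratchRsrcWords23() returns; the literal here is an assumption):

  // Standalone sketch, not LLVM code: splitting the combined words 2-3 of
  // the scratch resource descriptor into two 32-bit s_mov_b32 immediates.
  #include <cstdint>
  #include <cstdio>

  int main() {
    uint64_t Rsrc23 = 0x0080f000ffffffffULL; // assumed CI example value
    uint32_t Rsrc2 = Rsrc23 & 0xffffffff;    // low dword  -> 0xffffffff (-1)
    uint32_t Rsrc3 = Rsrc23 >> 32;           // high dword -> 0x80f000
    std::printf("Rsrc2 = 0x%x, Rsrc3 = 0x%x\n", Rsrc2, Rsrc3);
    return 0;
  }

On VI the high dword is 0x800000 instead, which is the only per-target difference visible in the FileCheck lines.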
+
+void SIFrameLowering::emitPrologue(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) const {
+  if (!MF.getFrameInfo()->hasStackObjects())
+    return;
+
+  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  // If we only have SGPR spills, we won't actually be using scratch memory
+  // since these spill to VGPRs.
+  //
+  // FIXME: We should be cleaning up these unused SGPR spill frame indices
+  // somewhere.
+  if (hasOnlySGPRSpills(MFI, MF.getFrameInfo()))
+    return;
+
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+
+  // We need to insert initialization of the scratch resource descriptor.
+  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
+  assert(ScratchRsrcReg != AMDGPU::NoRegister);
+
+  uint64_t Rsrc23 = TII->getScratchRsrcWords23();
+  MachineBasicBlock::iterator I = MBB.begin();
+  DebugLoc DL;
+
+  unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+  unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+  unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+  unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0)
+    .addExternalSymbol("SCRATCH_RSRC_DWORD0");
+
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1)
+    .addExternalSymbol("SCRATCH_RSRC_DWORD1");
+
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2)
+    .addImm(Rsrc23 & 0xffffffff);
+
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3)
+    .addImm(Rsrc23 >> 32);
+}
+
 void SIFrameLowering::processFunctionBeforeFrameFinalized(
   MachineFunction &MF,
   RegScavenger *RS) const {
   MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  if (!MFI->hasStackObjects())
+    return;
+
   bool MayNeedScavengingEmergencySlot = MFI->hasStackObjects();
 
   assert((RS || !MayNeedScavengingEmergencySlot) &&
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -116,10 +116,6 @@
                             SDValue Ptr, uint32_t RsrcDword1,
                             uint64_t RsrcDword2And3) const;
 
-  MachineSDNode *buildScratchRSRC(SelectionDAG &DAG,
-                                  SDLoc DL,
-                                  SDValue Ptr) const;
-
   std::pair<unsigned, const TargetRegisterClass *>
   getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                StringRef Constraint, MVT VT) const override;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -514,6 +514,7 @@
   MachineFunction &MF = DAG.getMachineFunction();
   FunctionType *FType = MF.getFunction()->getFunctionType();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
 
   if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) {
     const Function *Fn = MF.getFunction();
@@ -584,9 +585,9 @@
   // The pointer to the scratch buffer is stored in SGPR2, SGPR3
   if (Info->getShaderType() == ShaderType::COMPUTE) {
     if (Subtarget->isAmdHsaOS())
-      Info->NumUserSGPRs = 4;  // FIXME: Need to support scratch buffers.
+      Info->NumUserSGPRs += 4;  // FIXME: Need to support scratch buffers.
     else
-      Info->NumUserSGPRs = 4;
+      Info->NumUserSGPRs += 4;
 
     unsigned InputPtrReg =
       TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
@@ -721,6 +722,9 @@
     Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
   }
 
+  if (MF.getFrameInfo()->hasStackObjects() || ST.isVGPRSpillingEnabled(Info))
+    Info->setScratchRSrcReg(TRI);
+
   if (Chains.empty())
     return Chain;
 
@@ -2345,15 +2349,6 @@
   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
 }
 
-MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG,
-                                                  SDLoc DL,
-                                                  SDValue Ptr) const {
-  const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
-
-  return buildRSRC(DAG, DL, Ptr, 0, TII->getScratchRsrcWords23());
-}
-
 SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                                const TargetRegisterClass *RC,
                                                unsigned Reg, EVT VT) const {
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -551,15 +551,16 @@
 
   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
 
+  unsigned ScratchOffsetPreloadReg
+    = RI.getPreloadedValue(*MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+
   unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
   MFI->setHasSpilledVGPRs();
   BuildMI(MBB, MI, DL, get(Opcode))
     .addReg(SrcReg)                   // src
     .addFrameIndex(FrameIndex)        // frame_idx
-    // Place-holder registers, these will be filled in by
-    // SIPrepareScratchRegs.
-    .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
-    .addReg(AMDGPU::SGPR0, RegState::Undef)
+    .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+    .addReg(ScratchOffsetPreloadReg)  // scratch_offset
     .addMemOperand(MMO);
 }
 
@@ -637,13 +638,14 @@
 
   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
 
+  unsigned ScratchOffsetPreloadReg
+    = RI.getPreloadedValue(*MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+
   unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
     .addFrameIndex(FrameIndex)        // frame_idx
-    // Place-holder registers, these will be filled in by
-    // SIPrepareScratchRegs.
-    .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
-    .addReg(AMDGPU::SGPR0, RegState::Undef)
+    .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+    .addReg(ScratchOffsetPreloadReg)  // scratch_offset
     .addMemOperand(MMO);
 }
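With the placeholder registers gone, a VGPR spill pseudo now carries its real operands from the moment it is created. Schematically, with hypothetical registers matching the compute tests below (s[12:15] as the rsrc, s8 as the preloaded wave offset), a 4-byte save looks roughly like "SI_SPILL_V32_SAVE %vgpr0, <fi#0>, %sgpr12_sgpr13_sgpr14_sgpr15, %sgpr8"; operands 2 and 3 are exactly the ones the deleted SIPrepareScratchRegs pass used to patch in after the fact.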
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -29,6 +29,8 @@
   void anchor() override;
 
   unsigned TIDReg;
+  unsigned ScratchRSrcReg;
+
   bool HasSpilledSGPRs;
   bool HasSpilledVGPRs;
 
@@ -56,6 +58,14 @@
   unsigned getTIDReg() const { return TIDReg; };
   void setTIDReg(unsigned Reg) { TIDReg = Reg; }
 
+  /// \brief Returns the physical register reserved for use as the resource
+  /// descriptor for scratch accesses.
+  unsigned getScratchRSrcReg() const {
+    return ScratchRSrcReg;
+  }
+
+  void setScratchRSrcReg(const SIRegisterInfo *TRI);
+
   bool hasSpilledSGPRs() const {
     return HasSpilledSGPRs;
   }
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -29,12 +29,21 @@
 SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   : AMDGPUMachineFunction(MF),
     TIDReg(AMDGPU::NoRegister),
+    ScratchRSrcReg(AMDGPU::NoRegister),
     HasSpilledSGPRs(false),
     HasSpilledVGPRs(false),
     PSInputAddr(0),
     NumUserSGPRs(0),
     LDSWaveSpillSize(0) { }
 
+void SIMachineFunctionInfo::setScratchRSrcReg(const SIRegisterInfo *TRI) {
+  // We need to round up to next multiple of 4.
+  unsigned NextSReg128 = RoundUpToAlignment(NumUserSGPRs + 5, 4);
+  unsigned RegSub0 = AMDGPU::SReg_32RegClass.getRegister(NextSReg128);
+  ScratchRSrcReg = TRI->getMatchingSuperReg(RegSub0, AMDGPU::sub0,
+                                            &AMDGPU::SReg_128RegClass);
+}
+
 SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
                                                        MachineFunction *MF,
                                                        unsigned FrameIndex,
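To make the register choice concrete: on the non-HSA compute path LowerFormalArguments leaves NumUserSGPRs at 4, so the first sub-register of the descriptor is SGPR RoundUpToAlignment(4 + 5, 4) = 12 and the rsrc lives in s[12:15], exactly the range the new compute tests check. A standalone sketch of the same arithmetic (plain C++, not LLVM code; the 2-user-SGPR case is an assumption chosen to reproduce the s[8:11] seen in the graphics tests):

  #include <cstdio>

  // Round Value up to the next multiple of Align (a power of two here),
  // mirroring what setScratchRSrcReg does with RoundUpToAlignment.
  static unsigned roundUp(unsigned Value, unsigned Align) {
    return (Value + Align - 1) / Align * Align;
  }

  int main() {
    const unsigned UserSGPRCounts[] = {2, 4}; // graphics (assumed), compute
    for (unsigned NumUserSGPRs : UserSGPRCounts) {
      unsigned First = roundUp(NumUserSGPRs + 5, 4);
      std::printf("NumUserSGPRs = %u -> scratch rsrc in s[%u:%u]\n",
                  NumUserSGPRs, First, First + 3);
    }
    // Prints s[8:11] for 2 user SGPRs and s[12:15] for 4 user SGPRs.
    return 0;
  }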
Index: lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
===================================================================
--- lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// This pass loads scratch pointer and scratch offset into a register or a
-/// frame index which can be used anywhere in the program. These values will
-/// be used for spilling VGPRs.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIDefines.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-
-using namespace llvm;
-
-namespace {
-
-class SIPrepareScratchRegs : public MachineFunctionPass {
-
-private:
-  static char ID;
-
-public:
-  SIPrepareScratchRegs() : MachineFunctionPass(ID) { }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  const char *getPassName() const override {
-    return "SI prepare scratch registers";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-};
-
-} // End anonymous namespace
-
-char SIPrepareScratchRegs::ID = 0;
-
-FunctionPass *llvm::createSIPrepareScratchRegs() {
-  return new SIPrepareScratchRegs();
-}
-
-bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
-  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
-  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  MachineFrameInfo *FrameInfo = MF.getFrameInfo();
-  MachineBasicBlock *Entry = &MF.front();
-  MachineBasicBlock::iterator I = Entry->begin();
-  DebugLoc DL = I->getDebugLoc();
-
-  // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to
-  // run this pass.
-  if (!MFI->hasSpilledVGPRs())
-    return false;
-
-  unsigned ScratchPtrPreloadReg =
-      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
-  unsigned ScratchOffsetPreloadReg =
-      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
-
-  if (!Entry->isLiveIn(ScratchPtrPreloadReg))
-    Entry->addLiveIn(ScratchPtrPreloadReg);
-
-  if (!Entry->isLiveIn(ScratchOffsetPreloadReg))
-    Entry->addLiveIn(ScratchOffsetPreloadReg);
-
-  // Load the scratch offset.
-  unsigned ScratchOffsetReg =
-      TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass);
-  int ScratchOffsetFI = -1;
-
-  if (ScratchOffsetReg != AMDGPU::NoRegister) {
-    // Found an SGPR to use
-    BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg)
-      .addReg(ScratchOffsetPreloadReg);
-  } else {
-    // No SGPR is available, we must spill.
-    ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4, 4);
-    BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE))
-      .addReg(ScratchOffsetPreloadReg)
-      .addFrameIndex(ScratchOffsetFI)
-      .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
-      .addReg(AMDGPU::SGPR0, RegState::Undef);
-  }
-
-  // Now that we have the scratch pointer and offset values, we need to
-  // add them to all the SI_SPILL_V* instructions.
-
-  RegScavenger RS;
-  unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4);
-  RS.addScavengingFrameIndex(ScratchRsrcFI);
-
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-       BI != BE; ++BI) {
-
-    MachineBasicBlock &MBB = *BI;
-    // Add the scratch offset reg as a live-in so that the register scavenger
-    // doesn't re-use it.
-    if (!MBB.isLiveIn(ScratchOffsetReg) &&
-        ScratchOffsetReg != AMDGPU::NoRegister)
-      MBB.addLiveIn(ScratchOffsetReg);
-    RS.enterBasicBlock(&MBB);
-
-    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
-         I != E; ++I) {
-      MachineInstr &MI = *I;
-      RS.forward(I);
-      DebugLoc DL = MI.getDebugLoc();
-      if (!TII->isVGPRSpill(MI))
-        continue;
-
-      // Scratch resource
-      unsigned ScratchRsrcReg =
-          RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0);
-
-      uint64_t Rsrc23 = TII->getScratchRsrcWords23();
-
-      unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
-      unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
-      unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
-      unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
-
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0)
-        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
-        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1)
-        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
-        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2)
-        .addImm(Rsrc23 & 0xffffffff)
-        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3)
-        .addImm(Rsrc23 >> 32)
-        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
-      // Scratch Offset
-      if (ScratchOffsetReg == AMDGPU::NoRegister) {
-        ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
-        BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE),
-                ScratchOffsetReg)
-          .addFrameIndex(ScratchOffsetFI)
-          .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
-          .addReg(AMDGPU::SGPR0, RegState::Undef);
-      } else if (!MBB.isLiveIn(ScratchOffsetReg)) {
-        MBB.addLiveIn(ScratchOffsetReg);
-      }
-
-      if (ScratchRsrcReg == AMDGPU::NoRegister ||
-          ScratchOffsetReg == AMDGPU::NoRegister) {
-        LLVMContext &Ctx = MF.getFunction()->getContext();
-        Ctx.emitError("ran out of SGPRs for spilling VGPRs");
-        ScratchRsrcReg = AMDGPU::SGPR0;
-        ScratchOffsetReg = AMDGPU::SGPR0;
-      }
-      MI.getOperand(2).setReg(ScratchRsrcReg);
-      MI.getOperand(2).setIsKill(true);
-      MI.getOperand(2).setIsUndef(false);
-      MI.getOperand(3).setReg(ScratchOffsetReg);
-      MI.getOperand(3).setIsUndef(false);
-      MI.getOperand(3).setIsKill(false);
-      MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true));
-      MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true));
-      MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true));
-      MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true));
-    }
-  }
-  return true;
-}
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -68,6 +68,22 @@
     }
   }
 
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
+  if (ScratchRSrcReg != AMDGPU::NoRegister) {
+    unsigned ScratchOffsetPreloadReg
+      = getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+    // We will need to use this user SGPR argument for spilling, and thus never
+    // want it to be spilled.
+    reserveRegisterTuples(Reserved, ScratchOffsetPreloadReg);
+
+    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
+    // need to spill.
+    // TODO: May need to reserve a VGPR if doing LDS spilling.
+    reserveRegisterTuples(Reserved, ScratchRSrcReg);
+    assert(!isSubRegister(ScratchRSrcReg, ScratchOffsetPreloadReg));
+  }
+
   return Reserved;
 }
@@ -243,6 +259,9 @@
             .addReg(SubReg)
             .addImm(Spill.Lane);
 
+          // FIXME: Since this spills to another register instead of an actual
+          // frame index, we should delete the frame index when all references
+          // to it are fixed.
         }
         MI->eraseFromParent();
         break;
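Reading the reservation above with the layout the new tests check makes the assert concrete: user SGPRs sit in s[0:3], the scratch wave offset is preloaded into s8, and setScratchRSrcReg rounds up to s[12:15] for the descriptor, so the two reserved ranges cannot alias. If the offset register could ever fall inside the rsrc range, initializing the descriptor in the prologue would clobber the offset before the first spill used it, which is exactly what assert(!isSubRegister(ScratchRSrcReg, ScratchOffsetPreloadReg)) guards against.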
Index: test/CodeGen/AMDGPU/kernel-args.ll
===================================================================
--- test/CodeGen/AMDGPU/kernel-args.ll
+++ test/CodeGen/AMDGPU/kernel-args.ll
@@ -294,8 +294,8 @@
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
-; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
-; VI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44
+; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
+; VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
 define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
 entry:
   store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
@@ -311,7 +311,7 @@
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
-; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
+; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
 define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
 entry:
   store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
@@ -413,8 +413,8 @@
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
-; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
-; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
+; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
+; VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
 define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
 entry:
   store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
@@ -438,8 +438,8 @@
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
-; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
-; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
+; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
+; VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
 define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
 entry:
   store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
Index: test/CodeGen/AMDGPU/large-alloca-compute.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
+; XUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
+
+; FIXME: align on alloca seems to be ignored for private_segment_alignment
+
+; ALL-LABEL: {{^}}large_alloca_compute_shader:
+
+; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN: s_mov_b32 s14, -1
+; CI: s_mov_b32 s15, 0x80f000
+; VI: s_mov_b32 s15, 0x800000
+
+
+; GCNHSA: .amd_kernel_code_t
+; GCNHSA: private_segment_alignment = 4
+; GCNHSA: .end_amd_kernel_code_t
+
+; GCNHSA: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCNHSA: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCNHSA: s_mov_b32 s10, -1
+; CIHSA: s_mov_b32 s11, 0x180f000
+; VIHSA: s_mov_b32 s11, 0x11800000
+
+; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen
+; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen
+
+; Scratch size = alloca size + emergency stack slot
+; ALL: ; ScratchSize: 32772
+define void @large_alloca_compute_shader(i32 %x, i32 %y) #0 {
+  %large = alloca [8192 x i32], align 4
+  %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
+  store volatile i32 %x, i32* %gep
+  %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y
+  %val = load volatile i32, i32* %gep1
+  store volatile i32 %val, i32 addrspace(1)* undef
+  ret void
+}
+
+attributes #0 = { nounwind }
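The ScratchSize of 32772 checked above is easy to verify by hand: the alloca needs 8192 x 4 = 32768 bytes, and processFunctionBeforeFrameFinalized adds a single 4-byte emergency scavenging slot, giving 32768 + 4 = 32772 bytes per work-item. The deleted large-alloca.ll below expected 32776; the extra 4 bytes presumably came from the additional scratch-offset spill slot that SIPrepareScratchRegs could create.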
Index: test/CodeGen/AMDGPU/large-alloca-graphics.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/large-alloca-graphics.ll
@@ -0,0 +1,47 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
+
+; ALL-LABEL: {{^}}large_alloca_pixel_shader:
+; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN: s_mov_b32 s10, -1
+; CI: s_mov_b32 s11, 0x80f000
+; VI: s_mov_b32 s11, 0x800000
+
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
+
+; ALL: ; ScratchSize: 32772
+define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 {
+  %large = alloca [8192 x i32], align 4
+  %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
+  store volatile i32 %x, i32* %gep
+  %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y
+  %val = load volatile i32, i32* %gep1
+  store volatile i32 %val, i32 addrspace(1)* undef
+  ret void
+}
+
+; ALL-LABEL: {{^}}large_alloca_pixel_shader_inreg:
+; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN: s_mov_b32 s10, -1
+; CI: s_mov_b32 s11, 0x80f000
+; VI: s_mov_b32 s11, 0x800000
+
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
+
+; ALL: ; ScratchSize: 32772
+define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 {
+  %large = alloca [8192 x i32], align 4
+  %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
+  store volatile i32 %x, i32* %gep
+  %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y
+  %val = load volatile i32, i32* %gep1
+  store volatile i32 %val, i32 addrspace(1)* undef
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "ShaderType"="0" }
Index: test/CodeGen/AMDGPU/large-alloca.ll
===================================================================
--- test/CodeGen/AMDGPU/large-alloca.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-
-; GCN-LABEL: {{^}}large_alloca:
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
-; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
-; GCN: ScratchSize: 32776
-define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
-  %large = alloca [8192 x i32], align 4
-  %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
-  store i32 %x, i32* %gep
-  %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y
-  %load = load i32, i32* %gep1
-  store i32 %load, i32 addrspace(1)* %out
-  ret void
-}
-
-attributes #0 = { nounwind }
Index: test/CodeGen/AMDGPU/si-sgpr-spill.ll
===================================================================
--- test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -6,6 +6,16 @@
 
 ; CHECK-LABEL: {{^}}main:
 ; CHECK: s_wqm
+
+; Make sure we are not emitting the unused scratch resource descriptor setup.
+; CHECK-NOT: s_mov_b32
+; CHECK-NOT: s_mov_b32
+; CHECK-NOT: s_mov_b32
+; CHECK-NOT: s_mov_b32
+
+; CHECK: s_mov_b32 m0
+
+
 ; Writing to M0 from an SMRD instruction will hang the GPU.
 ; CHECK-NOT: s_buffer_load_dword m0
 ; CHECK: s_endpgm
Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -0,0 +1,583 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; XUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA %s
+; XUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA %s
+
+; This ends up using all 256 registers and requires register
+; scavenging, which will fail to find an unused register.
+
+; Check the ScratchSize to avoid regressions from spilling
+; intermediate register class copies.
+
+; FIXME: The same register is initialized to 0 for every spill.
+
+declare i32 @llvm.r600.read.tgid.x() #1
+declare i32 @llvm.r600.read.tgid.y() #1
+declare i32 @llvm.r600.read.tgid.z() #1
+
+; GCN-LABEL: {{^}}spill_vgpr_compute:
+
+; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN-NEXT: s_mov_b32 s14, -1
+; SI-NEXT: s_mov_b32 s15, 0x80f000
+; VI-NEXT: s_mov_b32 s15, 0x800000
+
+; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s8 offset:{{[0-9]+}} ; 4-byte Folded Spill
+
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}}
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}}
+
+; GCN: NumVgprs: 256
+; GCN: ScratchSize: 1024
+
+; s[0:3] input user SGPRs. s4,s5,s6 = workgroup IDs. s8 scratch offset.
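+; A note on the numbering above (derived from setScratchRSrcReg, not
+; checked by FileCheck): with 4 user SGPRs on the non-HSA compute path,
+; the first 4-aligned SGPR index at or above 4 + 5 = 9 is 12, so the
+; resource descriptor occupies s[12:15] and the s_mov_b32 checks begin
+; at s12.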
+define void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 { +bb: + %tmp = add i32 %arg1, %arg2 + %tmp7 = extractelement <4 x float> %arg6, i32 0 + %tmp8 = extractelement <4 x float> %arg6, i32 1 + %tmp9 = extractelement <4 x float> %arg6, i32 2 + %tmp10 = extractelement <4 x float> %arg6, i32 3 + %tmp11 = bitcast float %arg5 to i32 + br label %bb12 + +bb12: ; preds = %bb145, %bb + %tmp13 = phi float [ 0.000000e+00, %bb ], [ %tmp338, %bb145 ] + %tmp14 = phi float [ 0.000000e+00, %bb ], [ %tmp337, %bb145 ] + %tmp15 = phi float [ 0.000000e+00, %bb ], [ %tmp336, %bb145 ] + %tmp16 = phi float [ 0.000000e+00, %bb ], [ %tmp339, %bb145 ] + %tmp17 = phi float [ 0.000000e+00, %bb ], [ %tmp335, %bb145 ] + %tmp18 = phi float [ 0.000000e+00, %bb ], [ %tmp334, %bb145 ] + %tmp19 = phi float [ 0.000000e+00, %bb ], [ %tmp333, %bb145 ] + %tmp20 = phi float [ 0.000000e+00, %bb ], [ %tmp340, %bb145 ] + %tmp21 = phi float [ 0.000000e+00, %bb ], [ %tmp332, %bb145 ] + %tmp22 = phi float [ 0.000000e+00, %bb ], [ %tmp331, %bb145 ] + %tmp23 = phi float [ 0.000000e+00, %bb ], [ %tmp330, %bb145 ] + %tmp24 = phi float [ 0.000000e+00, %bb ], [ %tmp341, %bb145 ] + %tmp25 = phi float [ 0.000000e+00, %bb ], [ %tmp329, %bb145 ] + %tmp26 = phi float [ 0.000000e+00, %bb ], [ %tmp328, %bb145 ] + %tmp27 = phi float [ 0.000000e+00, %bb ], [ %tmp327, %bb145 ] + %tmp28 = phi float [ 0.000000e+00, %bb ], [ %tmp342, %bb145 ] + %tmp29 = phi float [ 0.000000e+00, %bb ], [ %tmp326, %bb145 ] + %tmp30 = phi float [ 0.000000e+00, %bb ], [ %tmp325, %bb145 ] + %tmp31 = phi float [ 0.000000e+00, %bb ], [ %tmp324, %bb145 ] + %tmp32 = phi float [ 0.000000e+00, %bb ], [ %tmp343, %bb145 ] + %tmp33 = phi float [ 0.000000e+00, %bb ], [ %tmp323, %bb145 ] + %tmp34 = phi float [ 0.000000e+00, %bb ], [ %tmp322, %bb145 ] + %tmp35 = phi float [ 0.000000e+00, %bb ], [ %tmp321, %bb145 ] + %tmp36 = phi float [ 0.000000e+00, %bb ], [ %tmp344, %bb145 ] + %tmp37 = phi float [ 0.000000e+00, %bb ], [ %tmp320, %bb145 ] + %tmp38 = phi float [ 0.000000e+00, %bb ], [ %tmp319, %bb145 ] + %tmp39 = phi float [ 0.000000e+00, %bb ], [ %tmp318, %bb145 ] + %tmp40 = phi float [ 0.000000e+00, %bb ], [ %tmp345, %bb145 ] + %tmp41 = phi float [ 0.000000e+00, %bb ], [ %tmp317, %bb145 ] + %tmp42 = phi float [ 0.000000e+00, %bb ], [ %tmp316, %bb145 ] + %tmp43 = phi float [ 0.000000e+00, %bb ], [ %tmp315, %bb145 ] + %tmp44 = phi float [ 0.000000e+00, %bb ], [ %tmp346, %bb145 ] + %tmp45 = phi float [ 0.000000e+00, %bb ], [ %tmp314, %bb145 ] + %tmp46 = phi float [ 0.000000e+00, %bb ], [ %tmp313, %bb145 ] + %tmp47 = phi float [ 0.000000e+00, %bb ], [ %tmp312, %bb145 ] + %tmp48 = phi float [ 0.000000e+00, %bb ], [ %tmp347, %bb145 ] + %tmp49 = phi float [ 0.000000e+00, %bb ], [ %tmp311, %bb145 ] + %tmp50 = phi float [ 0.000000e+00, %bb ], [ %tmp310, %bb145 ] + %tmp51 = phi float [ 0.000000e+00, %bb ], [ %tmp309, %bb145 ] + %tmp52 = phi float [ 0.000000e+00, %bb ], [ %tmp348, %bb145 ] + %tmp53 = phi float [ 0.000000e+00, %bb ], [ %tmp308, %bb145 ] + %tmp54 = phi float [ 0.000000e+00, %bb ], [ %tmp307, %bb145 ] + %tmp55 = phi float [ 0.000000e+00, %bb ], [ %tmp306, %bb145 ] + %tmp56 = phi float [ 0.000000e+00, %bb ], [ %tmp349, %bb145 ] + %tmp57 = phi float [ 0.000000e+00, %bb ], [ %tmp305, %bb145 ] + %tmp58 = phi float [ 0.000000e+00, %bb ], [ %tmp304, %bb145 ] + %tmp59 = phi float [ 0.000000e+00, %bb ], [ %tmp303, %bb145 ] + %tmp60 = phi float [ 0.000000e+00, %bb ], [ %tmp350, %bb145 ] + %tmp61 = phi 
float [ 0.000000e+00, %bb ], [ %tmp302, %bb145 ] + %tmp62 = phi float [ 0.000000e+00, %bb ], [ %tmp301, %bb145 ] + %tmp63 = phi float [ 0.000000e+00, %bb ], [ %tmp300, %bb145 ] + %tmp64 = phi float [ 0.000000e+00, %bb ], [ %tmp351, %bb145 ] + %tmp65 = phi float [ 0.000000e+00, %bb ], [ %tmp299, %bb145 ] + %tmp66 = phi float [ 0.000000e+00, %bb ], [ %tmp298, %bb145 ] + %tmp67 = phi float [ 0.000000e+00, %bb ], [ %tmp297, %bb145 ] + %tmp68 = phi float [ 0.000000e+00, %bb ], [ %tmp352, %bb145 ] + %tmp69 = phi float [ 0.000000e+00, %bb ], [ %tmp296, %bb145 ] + %tmp70 = phi float [ 0.000000e+00, %bb ], [ %tmp295, %bb145 ] + %tmp71 = phi float [ 0.000000e+00, %bb ], [ %tmp294, %bb145 ] + %tmp72 = phi float [ 0.000000e+00, %bb ], [ %tmp353, %bb145 ] + %tmp73 = phi float [ 0.000000e+00, %bb ], [ %tmp293, %bb145 ] + %tmp74 = phi float [ 0.000000e+00, %bb ], [ %tmp292, %bb145 ] + %tmp75 = phi float [ 0.000000e+00, %bb ], [ %tmp291, %bb145 ] + %tmp76 = phi float [ 0.000000e+00, %bb ], [ %tmp354, %bb145 ] + %tmp77 = phi float [ 0.000000e+00, %bb ], [ %tmp290, %bb145 ] + %tmp78 = phi float [ 0.000000e+00, %bb ], [ %tmp289, %bb145 ] + %tmp79 = phi float [ 0.000000e+00, %bb ], [ %tmp288, %bb145 ] + %tmp80 = phi float [ 0.000000e+00, %bb ], [ %tmp355, %bb145 ] + %tmp81 = phi float [ 0.000000e+00, %bb ], [ %tmp287, %bb145 ] + %tmp82 = phi float [ 0.000000e+00, %bb ], [ %tmp286, %bb145 ] + %tmp83 = phi float [ 0.000000e+00, %bb ], [ %tmp285, %bb145 ] + %tmp84 = phi float [ 0.000000e+00, %bb ], [ %tmp356, %bb145 ] + %tmp85 = phi float [ 0.000000e+00, %bb ], [ %tmp284, %bb145 ] + %tmp86 = phi float [ 0.000000e+00, %bb ], [ %tmp283, %bb145 ] + %tmp87 = phi float [ 0.000000e+00, %bb ], [ %tmp282, %bb145 ] + %tmp88 = phi float [ 0.000000e+00, %bb ], [ %tmp357, %bb145 ] + %tmp89 = phi float [ 0.000000e+00, %bb ], [ %tmp281, %bb145 ] + %tmp90 = phi float [ 0.000000e+00, %bb ], [ %tmp280, %bb145 ] + %tmp91 = phi float [ 0.000000e+00, %bb ], [ %tmp279, %bb145 ] + %tmp92 = phi float [ 0.000000e+00, %bb ], [ %tmp358, %bb145 ] + %tmp93 = phi float [ 0.000000e+00, %bb ], [ %tmp359, %bb145 ] + %tmp94 = phi float [ 0.000000e+00, %bb ], [ %tmp360, %bb145 ] + %tmp95 = phi float [ 0.000000e+00, %bb ], [ %tmp409, %bb145 ] + %tmp96 = phi float [ 0.000000e+00, %bb ], [ %tmp361, %bb145 ] + %tmp97 = phi float [ 0.000000e+00, %bb ], [ %tmp362, %bb145 ] + %tmp98 = phi float [ 0.000000e+00, %bb ], [ %tmp363, %bb145 ] + %tmp99 = phi float [ 0.000000e+00, %bb ], [ %tmp364, %bb145 ] + %tmp100 = phi float [ 0.000000e+00, %bb ], [ %tmp365, %bb145 ] + %tmp101 = phi float [ 0.000000e+00, %bb ], [ %tmp366, %bb145 ] + %tmp102 = phi float [ 0.000000e+00, %bb ], [ %tmp367, %bb145 ] + %tmp103 = phi float [ 0.000000e+00, %bb ], [ %tmp368, %bb145 ] + %tmp104 = phi float [ 0.000000e+00, %bb ], [ %tmp369, %bb145 ] + %tmp105 = phi float [ 0.000000e+00, %bb ], [ %tmp370, %bb145 ] + %tmp106 = phi float [ 0.000000e+00, %bb ], [ %tmp371, %bb145 ] + %tmp107 = phi float [ 0.000000e+00, %bb ], [ %tmp372, %bb145 ] + %tmp108 = phi float [ 0.000000e+00, %bb ], [ %tmp373, %bb145 ] + %tmp109 = phi float [ 0.000000e+00, %bb ], [ %tmp374, %bb145 ] + %tmp110 = phi float [ 0.000000e+00, %bb ], [ %tmp375, %bb145 ] + %tmp111 = phi float [ 0.000000e+00, %bb ], [ %tmp376, %bb145 ] + %tmp112 = phi float [ 0.000000e+00, %bb ], [ %tmp377, %bb145 ] + %tmp113 = phi float [ 0.000000e+00, %bb ], [ %tmp378, %bb145 ] + %tmp114 = phi float [ 0.000000e+00, %bb ], [ %tmp379, %bb145 ] + %tmp115 = phi float [ 0.000000e+00, %bb ], [ %tmp380, %bb145 ] + %tmp116 = phi float [ 
0.000000e+00, %bb ], [ %tmp381, %bb145 ] + %tmp117 = phi float [ 0.000000e+00, %bb ], [ %tmp382, %bb145 ] + %tmp118 = phi float [ 0.000000e+00, %bb ], [ %tmp383, %bb145 ] + %tmp119 = phi float [ 0.000000e+00, %bb ], [ %tmp384, %bb145 ] + %tmp120 = phi float [ 0.000000e+00, %bb ], [ %tmp385, %bb145 ] + %tmp121 = phi float [ 0.000000e+00, %bb ], [ %tmp386, %bb145 ] + %tmp122 = phi float [ 0.000000e+00, %bb ], [ %tmp387, %bb145 ] + %tmp123 = phi float [ 0.000000e+00, %bb ], [ %tmp388, %bb145 ] + %tmp124 = phi float [ 0.000000e+00, %bb ], [ %tmp389, %bb145 ] + %tmp125 = phi float [ 0.000000e+00, %bb ], [ %tmp390, %bb145 ] + %tmp126 = phi float [ 0.000000e+00, %bb ], [ %tmp391, %bb145 ] + %tmp127 = phi float [ 0.000000e+00, %bb ], [ %tmp392, %bb145 ] + %tmp128 = phi float [ 0.000000e+00, %bb ], [ %tmp393, %bb145 ] + %tmp129 = phi float [ 0.000000e+00, %bb ], [ %tmp394, %bb145 ] + %tmp130 = phi float [ 0.000000e+00, %bb ], [ %tmp395, %bb145 ] + %tmp131 = phi float [ 0.000000e+00, %bb ], [ %tmp396, %bb145 ] + %tmp132 = phi float [ 0.000000e+00, %bb ], [ %tmp397, %bb145 ] + %tmp133 = phi float [ 0.000000e+00, %bb ], [ %tmp398, %bb145 ] + %tmp134 = phi float [ 0.000000e+00, %bb ], [ %tmp399, %bb145 ] + %tmp135 = phi float [ 0.000000e+00, %bb ], [ %tmp400, %bb145 ] + %tmp136 = phi float [ 0.000000e+00, %bb ], [ %tmp401, %bb145 ] + %tmp137 = phi float [ 0.000000e+00, %bb ], [ %tmp402, %bb145 ] + %tmp138 = phi float [ 0.000000e+00, %bb ], [ %tmp403, %bb145 ] + %tmp139 = phi float [ 0.000000e+00, %bb ], [ %tmp404, %bb145 ] + %tmp140 = phi float [ 0.000000e+00, %bb ], [ %tmp405, %bb145 ] + %tmp141 = phi float [ 0.000000e+00, %bb ], [ %tmp406, %bb145 ] + %tmp142 = bitcast float %tmp95 to i32 + %tmp143 = icmp sgt i32 %tmp142, 125 + br i1 %tmp143, label %bb144, label %bb145 + +bb144: ; preds = %bb12 + store volatile float %arg3, float addrspace(1)* %arg + store volatile float %tmp91, float addrspace(1)* %arg + store volatile float %tmp90, float addrspace(1)* %arg + store volatile float %tmp89, float addrspace(1)* %arg + store volatile float %tmp87, float addrspace(1)* %arg + store volatile float %tmp86, float addrspace(1)* %arg + store volatile float %tmp85, float addrspace(1)* %arg + store volatile float %tmp83, float addrspace(1)* %arg + store volatile float %tmp82, float addrspace(1)* %arg + store volatile float %tmp81, float addrspace(1)* %arg + store volatile float %tmp79, float addrspace(1)* %arg + store volatile float %tmp78, float addrspace(1)* %arg + store volatile float %tmp77, float addrspace(1)* %arg + store volatile float %tmp75, float addrspace(1)* %arg + store volatile float %tmp74, float addrspace(1)* %arg + store volatile float %tmp73, float addrspace(1)* %arg + store volatile float %tmp71, float addrspace(1)* %arg + store volatile float %tmp70, float addrspace(1)* %arg + store volatile float %tmp69, float addrspace(1)* %arg + store volatile float %tmp67, float addrspace(1)* %arg + store volatile float %tmp66, float addrspace(1)* %arg + store volatile float %tmp65, float addrspace(1)* %arg + store volatile float %tmp63, float addrspace(1)* %arg + store volatile float %tmp62, float addrspace(1)* %arg + store volatile float %tmp61, float addrspace(1)* %arg + store volatile float %tmp59, float addrspace(1)* %arg + store volatile float %tmp58, float addrspace(1)* %arg + store volatile float %tmp57, float addrspace(1)* %arg + store volatile float %tmp55, float addrspace(1)* %arg + store volatile float %tmp54, float addrspace(1)* %arg + store volatile float %tmp53, float addrspace(1)* %arg + 
store volatile float %tmp51, float addrspace(1)* %arg + store volatile float %tmp50, float addrspace(1)* %arg + store volatile float %tmp49, float addrspace(1)* %arg + store volatile float %tmp47, float addrspace(1)* %arg + store volatile float %tmp46, float addrspace(1)* %arg + store volatile float %tmp45, float addrspace(1)* %arg + store volatile float %tmp43, float addrspace(1)* %arg + store volatile float %tmp42, float addrspace(1)* %arg + store volatile float %tmp41, float addrspace(1)* %arg + store volatile float %tmp39, float addrspace(1)* %arg + store volatile float %tmp38, float addrspace(1)* %arg + store volatile float %tmp37, float addrspace(1)* %arg + store volatile float %tmp35, float addrspace(1)* %arg + store volatile float %tmp34, float addrspace(1)* %arg + store volatile float %tmp33, float addrspace(1)* %arg + store volatile float %tmp31, float addrspace(1)* %arg + store volatile float %tmp30, float addrspace(1)* %arg + store volatile float %tmp29, float addrspace(1)* %arg + store volatile float %tmp27, float addrspace(1)* %arg + store volatile float %tmp26, float addrspace(1)* %arg + store volatile float %tmp25, float addrspace(1)* %arg + store volatile float %tmp23, float addrspace(1)* %arg + store volatile float %tmp22, float addrspace(1)* %arg + store volatile float %tmp21, float addrspace(1)* %arg + store volatile float %tmp19, float addrspace(1)* %arg + store volatile float %tmp18, float addrspace(1)* %arg + store volatile float %tmp17, float addrspace(1)* %arg + store volatile float %tmp15, float addrspace(1)* %arg + store volatile float %tmp14, float addrspace(1)* %arg + store volatile float %tmp13, float addrspace(1)* %arg + store volatile float %tmp16, float addrspace(1)* %arg + store volatile float %tmp20, float addrspace(1)* %arg + store volatile float %tmp24, float addrspace(1)* %arg + store volatile float %tmp28, float addrspace(1)* %arg + store volatile float %tmp32, float addrspace(1)* %arg + store volatile float %tmp36, float addrspace(1)* %arg + store volatile float %tmp40, float addrspace(1)* %arg + store volatile float %tmp44, float addrspace(1)* %arg + store volatile float %tmp48, float addrspace(1)* %arg + store volatile float %tmp52, float addrspace(1)* %arg + store volatile float %tmp56, float addrspace(1)* %arg + store volatile float %tmp60, float addrspace(1)* %arg + store volatile float %tmp64, float addrspace(1)* %arg + store volatile float %tmp68, float addrspace(1)* %arg + store volatile float %tmp72, float addrspace(1)* %arg + store volatile float %tmp76, float addrspace(1)* %arg + store volatile float %tmp80, float addrspace(1)* %arg + store volatile float %tmp84, float addrspace(1)* %arg + store volatile float %tmp88, float addrspace(1)* %arg + store volatile float %tmp92, float addrspace(1)* %arg + store volatile float %tmp93, float addrspace(1)* %arg + store volatile float %tmp94, float addrspace(1)* %arg + store volatile float %tmp96, float addrspace(1)* %arg + store volatile float %tmp97, float addrspace(1)* %arg + store volatile float %tmp98, float addrspace(1)* %arg + store volatile float %tmp99, float addrspace(1)* %arg + store volatile float %tmp100, float addrspace(1)* %arg + store volatile float %tmp101, float addrspace(1)* %arg + store volatile float %tmp102, float addrspace(1)* %arg + store volatile float %tmp103, float addrspace(1)* %arg + store volatile float %tmp104, float addrspace(1)* %arg + store volatile float %tmp105, float addrspace(1)* %arg + store volatile float %tmp106, float addrspace(1)* %arg + store volatile 
float %tmp107, float addrspace(1)* %arg + store volatile float %tmp108, float addrspace(1)* %arg + store volatile float %tmp109, float addrspace(1)* %arg + store volatile float %tmp110, float addrspace(1)* %arg + store volatile float %tmp111, float addrspace(1)* %arg + store volatile float %tmp112, float addrspace(1)* %arg + store volatile float %tmp113, float addrspace(1)* %arg + store volatile float %tmp114, float addrspace(1)* %arg + store volatile float %tmp115, float addrspace(1)* %arg + store volatile float %tmp116, float addrspace(1)* %arg + store volatile float %tmp117, float addrspace(1)* %arg + store volatile float %tmp118, float addrspace(1)* %arg + store volatile float %tmp119, float addrspace(1)* %arg + store volatile float %tmp120, float addrspace(1)* %arg + store volatile float %tmp121, float addrspace(1)* %arg + store volatile float %tmp122, float addrspace(1)* %arg + store volatile float %tmp123, float addrspace(1)* %arg + store volatile float %tmp124, float addrspace(1)* %arg + store volatile float %tmp125, float addrspace(1)* %arg + store volatile float %tmp126, float addrspace(1)* %arg + store volatile float %tmp127, float addrspace(1)* %arg + store volatile float %tmp128, float addrspace(1)* %arg + store volatile float %tmp129, float addrspace(1)* %arg + store volatile float %tmp130, float addrspace(1)* %arg + store volatile float %tmp131, float addrspace(1)* %arg + store volatile float %tmp132, float addrspace(1)* %arg + store volatile float %tmp133, float addrspace(1)* %arg + store volatile float %tmp134, float addrspace(1)* %arg + store volatile float %tmp135, float addrspace(1)* %arg + store volatile float %tmp136, float addrspace(1)* %arg + store volatile float %tmp137, float addrspace(1)* %arg + store volatile float %tmp138, float addrspace(1)* %arg + store volatile float %tmp139, float addrspace(1)* %arg + store volatile float %arg4, float addrspace(1)* %arg + store volatile float %tmp7, float addrspace(1)* %arg + store volatile float %tmp8, float addrspace(1)* %arg + store volatile float %tmp9, float addrspace(1)* %arg + store volatile float %tmp10, float addrspace(1)* %arg + ret void + +bb145: ; preds = %bb12 + %tmp146 = bitcast float %tmp95 to i32 + %tmp147 = bitcast float %tmp95 to i32 + %tmp148 = add i32 %tmp11, %tmp147 + %tmp149 = bitcast i32 %tmp148 to float + %tmp150 = insertelement <128 x float> undef, float %tmp91, i32 0 + %tmp151 = insertelement <128 x float> %tmp150, float %tmp90, i32 1 + %tmp152 = insertelement <128 x float> %tmp151, float %tmp89, i32 2 + %tmp153 = insertelement <128 x float> %tmp152, float %tmp87, i32 3 + %tmp154 = insertelement <128 x float> %tmp153, float %tmp86, i32 4 + %tmp155 = insertelement <128 x float> %tmp154, float %tmp85, i32 5 + %tmp156 = insertelement <128 x float> %tmp155, float %tmp83, i32 6 + %tmp157 = insertelement <128 x float> %tmp156, float %tmp82, i32 7 + %tmp158 = insertelement <128 x float> %tmp157, float %tmp81, i32 8 + %tmp159 = insertelement <128 x float> %tmp158, float %tmp79, i32 9 + %tmp160 = insertelement <128 x float> %tmp159, float %tmp78, i32 10 + %tmp161 = insertelement <128 x float> %tmp160, float %tmp77, i32 11 + %tmp162 = insertelement <128 x float> %tmp161, float %tmp75, i32 12 + %tmp163 = insertelement <128 x float> %tmp162, float %tmp74, i32 13 + %tmp164 = insertelement <128 x float> %tmp163, float %tmp73, i32 14 + %tmp165 = insertelement <128 x float> %tmp164, float %tmp71, i32 15 + %tmp166 = insertelement <128 x float> %tmp165, float %tmp70, i32 16 + %tmp167 = insertelement <128 x float> 
%tmp166, float %tmp69, i32 17 + %tmp168 = insertelement <128 x float> %tmp167, float %tmp67, i32 18 + %tmp169 = insertelement <128 x float> %tmp168, float %tmp66, i32 19 + %tmp170 = insertelement <128 x float> %tmp169, float %tmp65, i32 20 + %tmp171 = insertelement <128 x float> %tmp170, float %tmp63, i32 21 + %tmp172 = insertelement <128 x float> %tmp171, float %tmp62, i32 22 + %tmp173 = insertelement <128 x float> %tmp172, float %tmp61, i32 23 + %tmp174 = insertelement <128 x float> %tmp173, float %tmp59, i32 24 + %tmp175 = insertelement <128 x float> %tmp174, float %tmp58, i32 25 + %tmp176 = insertelement <128 x float> %tmp175, float %tmp57, i32 26 + %tmp177 = insertelement <128 x float> %tmp176, float %tmp55, i32 27 + %tmp178 = insertelement <128 x float> %tmp177, float %tmp54, i32 28 + %tmp179 = insertelement <128 x float> %tmp178, float %tmp53, i32 29 + %tmp180 = insertelement <128 x float> %tmp179, float %tmp51, i32 30 + %tmp181 = insertelement <128 x float> %tmp180, float %tmp50, i32 31 + %tmp182 = insertelement <128 x float> %tmp181, float %tmp49, i32 32 + %tmp183 = insertelement <128 x float> %tmp182, float %tmp47, i32 33 + %tmp184 = insertelement <128 x float> %tmp183, float %tmp46, i32 34 + %tmp185 = insertelement <128 x float> %tmp184, float %tmp45, i32 35 + %tmp186 = insertelement <128 x float> %tmp185, float %tmp43, i32 36 + %tmp187 = insertelement <128 x float> %tmp186, float %tmp42, i32 37 + %tmp188 = insertelement <128 x float> %tmp187, float %tmp41, i32 38 + %tmp189 = insertelement <128 x float> %tmp188, float %tmp39, i32 39 + %tmp190 = insertelement <128 x float> %tmp189, float %tmp38, i32 40 + %tmp191 = insertelement <128 x float> %tmp190, float %tmp37, i32 41 + %tmp192 = insertelement <128 x float> %tmp191, float %tmp35, i32 42 + %tmp193 = insertelement <128 x float> %tmp192, float %tmp34, i32 43 + %tmp194 = insertelement <128 x float> %tmp193, float %tmp33, i32 44 + %tmp195 = insertelement <128 x float> %tmp194, float %tmp31, i32 45 + %tmp196 = insertelement <128 x float> %tmp195, float %tmp30, i32 46 + %tmp197 = insertelement <128 x float> %tmp196, float %tmp29, i32 47 + %tmp198 = insertelement <128 x float> %tmp197, float %tmp27, i32 48 + %tmp199 = insertelement <128 x float> %tmp198, float %tmp26, i32 49 + %tmp200 = insertelement <128 x float> %tmp199, float %tmp25, i32 50 + %tmp201 = insertelement <128 x float> %tmp200, float %tmp23, i32 51 + %tmp202 = insertelement <128 x float> %tmp201, float %tmp22, i32 52 + %tmp203 = insertelement <128 x float> %tmp202, float %tmp21, i32 53 + %tmp204 = insertelement <128 x float> %tmp203, float %tmp19, i32 54 + %tmp205 = insertelement <128 x float> %tmp204, float %tmp18, i32 55 + %tmp206 = insertelement <128 x float> %tmp205, float %tmp17, i32 56 + %tmp207 = insertelement <128 x float> %tmp206, float %tmp15, i32 57 + %tmp208 = insertelement <128 x float> %tmp207, float %tmp14, i32 58 + %tmp209 = insertelement <128 x float> %tmp208, float %tmp13, i32 59 + %tmp210 = insertelement <128 x float> %tmp209, float %tmp16, i32 60 + %tmp211 = insertelement <128 x float> %tmp210, float %tmp20, i32 61 + %tmp212 = insertelement <128 x float> %tmp211, float %tmp24, i32 62 + %tmp213 = insertelement <128 x float> %tmp212, float %tmp28, i32 63 + %tmp214 = insertelement <128 x float> %tmp213, float %tmp32, i32 64 + %tmp215 = insertelement <128 x float> %tmp214, float %tmp36, i32 65 + %tmp216 = insertelement <128 x float> %tmp215, float %tmp40, i32 66 + %tmp217 = insertelement <128 x float> %tmp216, float %tmp44, i32 67 + %tmp218 = 
insertelement <128 x float> %tmp217, float %tmp48, i32 68
+  %tmp219 = insertelement <128 x float> %tmp218, float %tmp52, i32 69
+  %tmp220 = insertelement <128 x float> %tmp219, float %tmp56, i32 70
+  %tmp221 = insertelement <128 x float> %tmp220, float %tmp60, i32 71
+  %tmp222 = insertelement <128 x float> %tmp221, float %tmp64, i32 72
+  %tmp223 = insertelement <128 x float> %tmp222, float %tmp68, i32 73
+  %tmp224 = insertelement <128 x float> %tmp223, float %tmp72, i32 74
+  %tmp225 = insertelement <128 x float> %tmp224, float %tmp76, i32 75
+  %tmp226 = insertelement <128 x float> %tmp225, float %tmp80, i32 76
+  %tmp227 = insertelement <128 x float> %tmp226, float %tmp84, i32 77
+  %tmp228 = insertelement <128 x float> %tmp227, float %tmp88, i32 78
+  %tmp229 = insertelement <128 x float> %tmp228, float %tmp92, i32 79
+  %tmp230 = insertelement <128 x float> %tmp229, float %tmp93, i32 80
+  %tmp231 = insertelement <128 x float> %tmp230, float %tmp94, i32 81
+  %tmp232 = insertelement <128 x float> %tmp231, float %tmp96, i32 82
+  %tmp233 = insertelement <128 x float> %tmp232, float %tmp97, i32 83
+  %tmp234 = insertelement <128 x float> %tmp233, float %tmp98, i32 84
+  %tmp235 = insertelement <128 x float> %tmp234, float %tmp99, i32 85
+  %tmp236 = insertelement <128 x float> %tmp235, float %tmp100, i32 86
+  %tmp237 = insertelement <128 x float> %tmp236, float %tmp101, i32 87
+  %tmp238 = insertelement <128 x float> %tmp237, float %tmp102, i32 88
+  %tmp239 = insertelement <128 x float> %tmp238, float %tmp103, i32 89
+  %tmp240 = insertelement <128 x float> %tmp239, float %tmp104, i32 90
+  %tmp241 = insertelement <128 x float> %tmp240, float %tmp105, i32 91
+  %tmp242 = insertelement <128 x float> %tmp241, float %tmp106, i32 92
+  %tmp243 = insertelement <128 x float> %tmp242, float %tmp107, i32 93
+  %tmp244 = insertelement <128 x float> %tmp243, float %tmp108, i32 94
+  %tmp245 = insertelement <128 x float> %tmp244, float %tmp109, i32 95
+  %tmp246 = insertelement <128 x float> %tmp245, float %tmp110, i32 96
+  %tmp247 = insertelement <128 x float> %tmp246, float %tmp111, i32 97
+  %tmp248 = insertelement <128 x float> %tmp247, float %tmp112, i32 98
+  %tmp249 = insertelement <128 x float> %tmp248, float %tmp113, i32 99
+  %tmp250 = insertelement <128 x float> %tmp249, float %tmp114, i32 100
+  %tmp251 = insertelement <128 x float> %tmp250, float %tmp115, i32 101
+  %tmp252 = insertelement <128 x float> %tmp251, float %tmp116, i32 102
+  %tmp253 = insertelement <128 x float> %tmp252, float %tmp117, i32 103
+  %tmp254 = insertelement <128 x float> %tmp253, float %tmp118, i32 104
+  %tmp255 = insertelement <128 x float> %tmp254, float %tmp119, i32 105
+  %tmp256 = insertelement <128 x float> %tmp255, float %tmp120, i32 106
+  %tmp257 = insertelement <128 x float> %tmp256, float %tmp121, i32 107
+  %tmp258 = insertelement <128 x float> %tmp257, float %tmp122, i32 108
+  %tmp259 = insertelement <128 x float> %tmp258, float %tmp123, i32 109
+  %tmp260 = insertelement <128 x float> %tmp259, float %tmp124, i32 110
+  %tmp261 = insertelement <128 x float> %tmp260, float %tmp125, i32 111
+  %tmp262 = insertelement <128 x float> %tmp261, float %tmp126, i32 112
+  %tmp263 = insertelement <128 x float> %tmp262, float %tmp127, i32 113
+  %tmp264 = insertelement <128 x float> %tmp263, float %tmp128, i32 114
+  %tmp265 = insertelement <128 x float> %tmp264, float %tmp129, i32 115
+  %tmp266 = insertelement <128 x float> %tmp265, float %tmp130, i32 116
+  %tmp267 = insertelement <128 x float> %tmp266, float %tmp131, i32 117
+  %tmp268 = insertelement <128 x float> %tmp267, float %tmp132, i32 118
+  %tmp269 = insertelement <128 x float> %tmp268, float %tmp133, i32 119
+  %tmp270 = insertelement <128 x float> %tmp269, float %tmp134, i32 120
+  %tmp271 = insertelement <128 x float> %tmp270, float %tmp135, i32 121
+  %tmp272 = insertelement <128 x float> %tmp271, float %tmp136, i32 122
+  %tmp273 = insertelement <128 x float> %tmp272, float %tmp137, i32 123
+  %tmp274 = insertelement <128 x float> %tmp273, float %tmp138, i32 124
+  %tmp275 = insertelement <128 x float> %tmp274, float %tmp139, i32 125
+  %tmp276 = insertelement <128 x float> %tmp275, float %tmp140, i32 126
+  %tmp277 = insertelement <128 x float> %tmp276, float %tmp141, i32 127
+  %tmp278 = insertelement <128 x float> %tmp277, float %tmp149, i32 %tmp146
+  %tmp279 = extractelement <128 x float> %tmp278, i32 0
+  %tmp280 = extractelement <128 x float> %tmp278, i32 1
+  %tmp281 = extractelement <128 x float> %tmp278, i32 2
+  %tmp282 = extractelement <128 x float> %tmp278, i32 3
+  %tmp283 = extractelement <128 x float> %tmp278, i32 4
+  %tmp284 = extractelement <128 x float> %tmp278, i32 5
+  %tmp285 = extractelement <128 x float> %tmp278, i32 6
+  %tmp286 = extractelement <128 x float> %tmp278, i32 7
+  %tmp287 = extractelement <128 x float> %tmp278, i32 8
+  %tmp288 = extractelement <128 x float> %tmp278, i32 9
+  %tmp289 = extractelement <128 x float> %tmp278, i32 10
+  %tmp290 = extractelement <128 x float> %tmp278, i32 11
+  %tmp291 = extractelement <128 x float> %tmp278, i32 12
+  %tmp292 = extractelement <128 x float> %tmp278, i32 13
+  %tmp293 = extractelement <128 x float> %tmp278, i32 14
+  %tmp294 = extractelement <128 x float> %tmp278, i32 15
+  %tmp295 = extractelement <128 x float> %tmp278, i32 16
+  %tmp296 = extractelement <128 x float> %tmp278, i32 17
+  %tmp297 = extractelement <128 x float> %tmp278, i32 18
+  %tmp298 = extractelement <128 x float> %tmp278, i32 19
+  %tmp299 = extractelement <128 x float> %tmp278, i32 20
+  %tmp300 = extractelement <128 x float> %tmp278, i32 21
+  %tmp301 = extractelement <128 x float> %tmp278, i32 22
+  %tmp302 = extractelement <128 x float> %tmp278, i32 23
+  %tmp303 = extractelement <128 x float> %tmp278, i32 24
+  %tmp304 = extractelement <128 x float> %tmp278, i32 25
+  %tmp305 = extractelement <128 x float> %tmp278, i32 26
+  %tmp306 = extractelement <128 x float> %tmp278, i32 27
+  %tmp307 = extractelement <128 x float> %tmp278, i32 28
+  %tmp308 = extractelement <128 x float> %tmp278, i32 29
+  %tmp309 = extractelement <128 x float> %tmp278, i32 30
+  %tmp310 = extractelement <128 x float> %tmp278, i32 31
+  %tmp311 = extractelement <128 x float> %tmp278, i32 32
+  %tmp312 = extractelement <128 x float> %tmp278, i32 33
+  %tmp313 = extractelement <128 x float> %tmp278, i32 34
+  %tmp314 = extractelement <128 x float> %tmp278, i32 35
+  %tmp315 = extractelement <128 x float> %tmp278, i32 36
+  %tmp316 = extractelement <128 x float> %tmp278, i32 37
+  %tmp317 = extractelement <128 x float> %tmp278, i32 38
+  %tmp318 = extractelement <128 x float> %tmp278, i32 39
+  %tmp319 = extractelement <128 x float> %tmp278, i32 40
+  %tmp320 = extractelement <128 x float> %tmp278, i32 41
+  %tmp321 = extractelement <128 x float> %tmp278, i32 42
+  %tmp322 = extractelement <128 x float> %tmp278, i32 43
+  %tmp323 = extractelement <128 x float> %tmp278, i32 44
+  %tmp324 = extractelement <128 x float> %tmp278, i32 45
+  %tmp325 = extractelement <128 x float> %tmp278, i32 46
+  %tmp326 = extractelement <128 x float> %tmp278, i32 47
+  %tmp327 = extractelement <128 x float> %tmp278, i32 48
+  %tmp328 = extractelement <128 x float> %tmp278, i32 49
+  %tmp329 = extractelement <128 x float> %tmp278, i32 50
+  %tmp330 = extractelement <128 x float> %tmp278, i32 51
+  %tmp331 = extractelement <128 x float> %tmp278, i32 52
+  %tmp332 = extractelement <128 x float> %tmp278, i32 53
+  %tmp333 = extractelement <128 x float> %tmp278, i32 54
+  %tmp334 = extractelement <128 x float> %tmp278, i32 55
+  %tmp335 = extractelement <128 x float> %tmp278, i32 56
+  %tmp336 = extractelement <128 x float> %tmp278, i32 57
+  %tmp337 = extractelement <128 x float> %tmp278, i32 58
+  %tmp338 = extractelement <128 x float> %tmp278, i32 59
+  %tmp339 = extractelement <128 x float> %tmp278, i32 60
+  %tmp340 = extractelement <128 x float> %tmp278, i32 61
+  %tmp341 = extractelement <128 x float> %tmp278, i32 62
+  %tmp342 = extractelement <128 x float> %tmp278, i32 63
+  %tmp343 = extractelement <128 x float> %tmp278, i32 64
+  %tmp344 = extractelement <128 x float> %tmp278, i32 65
+  %tmp345 = extractelement <128 x float> %tmp278, i32 66
+  %tmp346 = extractelement <128 x float> %tmp278, i32 67
+  %tmp347 = extractelement <128 x float> %tmp278, i32 68
+  %tmp348 = extractelement <128 x float> %tmp278, i32 69
+  %tmp349 = extractelement <128 x float> %tmp278, i32 70
+  %tmp350 = extractelement <128 x float> %tmp278, i32 71
+  %tmp351 = extractelement <128 x float> %tmp278, i32 72
+  %tmp352 = extractelement <128 x float> %tmp278, i32 73
+  %tmp353 = extractelement <128 x float> %tmp278, i32 74
+  %tmp354 = extractelement <128 x float> %tmp278, i32 75
+  %tmp355 = extractelement <128 x float> %tmp278, i32 76
+  %tmp356 = extractelement <128 x float> %tmp278, i32 77
+  %tmp357 = extractelement <128 x float> %tmp278, i32 78
+  %tmp358 = extractelement <128 x float> %tmp278, i32 79
+  %tmp359 = extractelement <128 x float> %tmp278, i32 80
+  %tmp360 = extractelement <128 x float> %tmp278, i32 81
+  %tmp361 = extractelement <128 x float> %tmp278, i32 82
+  %tmp362 = extractelement <128 x float> %tmp278, i32 83
+  %tmp363 = extractelement <128 x float> %tmp278, i32 84
+  %tmp364 = extractelement <128 x float> %tmp278, i32 85
+  %tmp365 = extractelement <128 x float> %tmp278, i32 86
+  %tmp366 = extractelement <128 x float> %tmp278, i32 87
+  %tmp367 = extractelement <128 x float> %tmp278, i32 88
+  %tmp368 = extractelement <128 x float> %tmp278, i32 89
+  %tmp369 = extractelement <128 x float> %tmp278, i32 90
+  %tmp370 = extractelement <128 x float> %tmp278, i32 91
+  %tmp371 = extractelement <128 x float> %tmp278, i32 92
+  %tmp372 = extractelement <128 x float> %tmp278, i32 93
+  %tmp373 = extractelement <128 x float> %tmp278, i32 94
+  %tmp374 = extractelement <128 x float> %tmp278, i32 95
+  %tmp375 = extractelement <128 x float> %tmp278, i32 96
+  %tmp376 = extractelement <128 x float> %tmp278, i32 97
+  %tmp377 = extractelement <128 x float> %tmp278, i32 98
+  %tmp378 = extractelement <128 x float> %tmp278, i32 99
+  %tmp379 = extractelement <128 x float> %tmp278, i32 100
+  %tmp380 = extractelement <128 x float> %tmp278, i32 101
+  %tmp381 = extractelement <128 x float> %tmp278, i32 102
+  %tmp382 = extractelement <128 x float> %tmp278, i32 103
+  %tmp383 = extractelement <128 x float> %tmp278, i32 104
+  %tmp384 = extractelement <128 x float> %tmp278, i32 105
+  %tmp385 = extractelement <128 x float> %tmp278, i32 106
+  %tmp386 = extractelement <128 x float> %tmp278, i32 107
+  %tmp387 = extractelement <128 x float> %tmp278, i32 108
+  %tmp388 = extractelement <128 x float> %tmp278, i32 109
+  %tmp389 = extractelement <128 x float> %tmp278, i32 110
+  %tmp390 = extractelement <128 x float> %tmp278, i32 111
+  %tmp391 = extractelement <128 x float> %tmp278, i32 112
+  %tmp392 = extractelement <128 x float> %tmp278, i32 113
+  %tmp393 = extractelement <128 x float> %tmp278, i32 114
+  %tmp394 = extractelement <128 x float> %tmp278, i32 115
+  %tmp395 = extractelement <128 x float> %tmp278, i32 116
+  %tmp396 = extractelement <128 x float> %tmp278, i32 117
+  %tmp397 = extractelement <128 x float> %tmp278, i32 118
+  %tmp398 = extractelement <128 x float> %tmp278, i32 119
+  %tmp399 = extractelement <128 x float> %tmp278, i32 120
+  %tmp400 = extractelement <128 x float> %tmp278, i32 121
+  %tmp401 = extractelement <128 x float> %tmp278, i32 122
+  %tmp402 = extractelement <128 x float> %tmp278, i32 123
+  %tmp403 = extractelement <128 x float> %tmp278, i32 124
+  %tmp404 = extractelement <128 x float> %tmp278, i32 125
+  %tmp405 = extractelement <128 x float> %tmp278, i32 126
+  %tmp406 = extractelement <128 x float> %tmp278, i32 127
+  %tmp407 = bitcast float %tmp95 to i32
+  %tmp408 = add i32 %tmp407, 1
+  %tmp409 = bitcast i32 %tmp408 to float
+  br label %bb12
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
===================================================================
--- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -1,7 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling < %s | FileCheck %s
-
-; FIXME: Enable -verify-instructions
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; This ends up using all 255 registers and requires register
 ; scavenging which will fail to find an unsued register.
@@ -11,9 +9,19 @@
 
 ; FIXME: The same register is initialized to 0 for every spill.
 
-; CHECK-LABEL: {{^}}main:
-; CHECK: NumVgprs: 256
-; CHECK: ScratchSize: 1024
+; GCN-LABEL: {{^}}main:
+
+; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCN-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s11, 0x80f000
+; VI-NEXT: s_mov_b32 s11, 0x800000
+
+; s12 is offset user SGPR
+; GCN: buffer_store_dword {{v[0-9]+}}, s[8:11], s12 offset:{{[0-9]+}} ; 4-byte Folded Spill
+
+; GCN: NumVgprs: 256
+; GCN: ScratchSize: 1024
 define void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
 bb: