diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1284,6 +1284,10 @@
 def EnableLateCFGStructurize : Predicate<
   "EnableLateStructurizeCFG">;
 
+def EnableFlatScratch : Predicate<"Subtarget->enableFlatScratch()">;
+
+def DisableFlatScratch : Predicate<"!Subtarget->enableFlatScratch()">;
+
 // Include AMDGPU TD files
 include "SISchedule.td"
 include "GCNProcessors.td"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -240,6 +240,8 @@
                          SDValue &Offset) const;
   bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
                         SDValue &VOffset, SDValue &Offset) const;
+  bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
+                          SDValue &Offset) const;
 
   bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
                         bool &Imm) const;
@@ -1672,9 +1674,11 @@
                                           SDValue &Offset) const {
   int64_t OffsetVal = 0;
 
+  unsigned AS = findMemSDNode(N)->getAddressSpace();
+
   if (Subtarget->hasFlatInstOffsets() &&
       (!Subtarget->hasFlatSegmentOffsetBug() ||
-       findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) {
+       AS != AMDGPUAS::FLAT_ADDRESS)) {
     SDValue N0, N1;
     if (CurDAG->isBaseWithConstantOffset(Addr)) {
       N0 = Addr.getOperand(0);
@@ -1686,7 +1690,6 @@
       uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
 
       const SIInstrInfo *TII = Subtarget->getInstrInfo();
-      unsigned AS = findMemSDNode(N)->getAddressSpace();
       if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
         Addr = N0;
         OffsetVal = COffsetVal;
@@ -1719,39 +1722,52 @@
       OffsetVal = ImmField;
 
-      // TODO: Should this try to use a scalar add pseudo if the base address
-      // is uniform and saddr is usable?
-      SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
-      SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
-
-      SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
-                                            MVT::i32, N0, Sub0);
-      SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
-                                            MVT::i32, N0, Sub1);
-
       SDValue AddOffsetLo = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
-      SDValue AddOffsetHi =
-          getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
-
-      SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
       SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
 
-      SDNode *Add =
-          CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
-                                 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
-
-      SDNode *Addc = CurDAG->getMachineNode(
-          AMDGPU::V_ADDC_U32_e64, DL, VTs,
-          {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
-
-      SDValue RegSequenceArgs[] = {
-          CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
-          SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
-
-      Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
-                                            MVT::i64, RegSequenceArgs),
-                     0);
+      if (Addr.getValueType().getSizeInBits() == 32) {
+        SmallVector<SDValue, 3> Opnds;
+        Opnds.push_back(N0);
+        Opnds.push_back(AddOffsetLo);
+        unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
+        if (Subtarget->hasAddNoCarry()) {
+          AddOp = AMDGPU::V_ADD_U32_e64;
+          Opnds.push_back(Clamp);
+        }
+        Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+      } else {
+        // TODO: Should this try to use a scalar add pseudo if the base address
+        // is uniform and saddr is usable?
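+        // The 64-bit path below materializes the remainder with a carry
+        // chain, roughly (illustrative mnemonics, not emitted verbatim):
+        //   v_add_co_u32  v_lo, vcc, lo32(remainder), base_lo
+        //   v_addc_u32    v_hi, vcc, hi32(remainder), base_hi, vcc
+        // and then rebuilds the 64-bit address with a REG_SEQUENCE.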
+        SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+        SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+
+        SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                              DL, MVT::i32, N0, Sub0);
+        SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                              DL, MVT::i32, N0, Sub1);
+
+        SDValue AddOffsetHi =
+            getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+
+        SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+
+        SDNode *Add =
+            CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
+                                   {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+
+        SDNode *Addc = CurDAG->getMachineNode(
+            AMDGPU::V_ADDC_U32_e64, DL, VTs,
+            {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+
+        SDValue RegSequenceArgs[] = {
+            CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
+            SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
+
+        Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                              MVT::i64, RegSequenceArgs),
+                       0);
+      }
     }
   }
 }
 
@@ -1824,6 +1840,64 @@
   return true;
 }
 
+// Match (32-bit SGPR base) + sext(imm offset)
+bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N,
+                                            SDValue Addr,
+                                            SDValue &SAddr,
+                                            SDValue &Offset) const {
+  if (Addr->isDivergent())
+    return false;
+
+  SAddr = Addr;
+  int64_t COffsetVal = 0;
+
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
+    SAddr = Addr.getOperand(0);
+  }
+
+  if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
+    SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
+  } else if (SAddr.getOpcode() == ISD::ADD &&
+             isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
+    // Materialize this into a scalar move for scalar address to avoid
+    // readfirstlane.
+    auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
+                                              FI->getValueType(0));
+    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, SDLoc(SAddr),
+                                           MVT::i32, TFI, SAddr.getOperand(1)),
+                    0);
+  }
+
+  const SIInstrInfo *TII = Subtarget->getInstrInfo();
+
+  if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
+    int64_t RemainderOffset = COffsetVal;
+    int64_t ImmField = 0;
+    const unsigned NumBits = TII->getNumFlatOffsetBits(true);
+    // Use signed division by a power of two to truncate towards 0.
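+    // For example, with NumBits == 13 the divisor D is 1 << 12 == 4096:
+    // an offset of 9000 splits into RemainderOffset = 8192 (added to SAddr
+    // below) and ImmField = 808, while -9000 splits into -8192 and -808.
+    // Either way |ImmField| < D, so it fits the signed immediate field.
+    // (Values are illustrative; the real width comes from
+    // getNumFlatOffsetBits.)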
+    int64_t D = 1LL << (NumBits - 1);
+    RemainderOffset = (COffsetVal / D) * D;
+    ImmField = COffsetVal - RemainderOffset;
+
+    assert(TII->isLegalFLATOffset(ImmField, AMDGPUAS::PRIVATE_ADDRESS, true));
+    assert(RemainderOffset + ImmField == COffsetVal);
+
+    COffsetVal = ImmField;
+
+    SDLoc DL(N);
+    SDValue AddOffset =
+        getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, DL, MVT::i32,
+                                           SAddr, AddOffset), 0);
+  }
+
+  Offset = CurDAG->getTargetConstant(COffsetVal, SDLoc(), MVT::i16);
+
+  return true;
+}
+
 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
                                           SDValue &Offset, bool &Imm) const {
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -959,6 +959,8 @@
     return true;
   }
 
+  bool enableFlatScratch() const;
+
   void overrideSchedPolicy(MachineSchedPolicy &Policy,
                            unsigned NumRegionInstrs) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -50,6 +50,11 @@
   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
   cl::init(false));
 
+static cl::opt<bool> EnableFlatScratch(
+  "amdgpu-enable-flat-scratch",
+  cl::desc("Use flat scratch instructions"),
+  cl::init(false));
+
 GCNSubtarget::~GCNSubtarget() = default;
 
 R600Subtarget &
@@ -286,6 +291,10 @@
       *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
 }
 
+bool GCNSubtarget::enableFlatScratch() const {
+  return EnableFlatScratch && hasFlatScratchInsts();
+}
+
 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
   if (getGeneration() < GFX10)
     return 1;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1592,6 +1592,7 @@
 >;
 }
 
+let OtherPredicates = [DisableFlatScratch] in {
 defm : MUBUFScratchLoadPat ;
 defm : MUBUFScratchLoadPat ;
 defm : MUBUFScratchLoadPat ;
@@ -1610,7 +1611,7 @@
 defm : MUBUFScratchLoadPat ;
 defm : MUBUFScratchLoadPat ;
 
-let OtherPredicates = [D16PreservesUnusedBits] in {
+let OtherPredicates = [D16PreservesUnusedBits, DisableFlatScratch] in {
 defm : MUBUFScratchLoadPat_D16;
 defm : MUBUFScratchLoadPat_D16;
 defm : MUBUFScratchLoadPat_D16;
@@ -1626,6 +1627,8 @@
 defm : MUBUFScratchLoadPat_D16;
 }
 
+} // End OtherPredicates = [DisableFlatScratch]
+
 multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
                                       ValueType vt, PatFrag atomic_st> {
   // Store follows atomic op convention so address is first
@@ -1676,6 +1679,7 @@
 >;
 }
 
+let OtherPredicates = [DisableFlatScratch] in {
 defm : MUBUFScratchStorePat ;
 defm : MUBUFScratchStorePat ;
 defm : MUBUFScratchStorePat ;
@@ -1690,7 +1694,7 @@
 defm : MUBUFScratchStorePat ;
 
-let OtherPredicates = [D16PreservesUnusedBits] in {
+let OtherPredicates = [D16PreservesUnusedBits, DisableFlatScratch] in {
 // Hiding the extract high pattern in the PatFrag seems to not
 // automatically increase the complexity.
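// (The AddedComplexity = 1 below keeps these D16 hi-half store patterns ahead
// of the plain truncating-store patterns; with flat scratch enabled the
// equivalent ScratchFLAT* D16 patterns in FLATInstructions.td apply instead.)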
 let AddedComplexity = 1 in {
@@ -1698,6 +1702,7 @@
 defm : MUBUFScratchStorePat ;
 }
 }
+} // End OtherPredicates = [DisableFlatScratch]
 
 //===----------------------------------------------------------------------===//
 // MTBUF Patterns
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -8,8 +8,10 @@
 def FLATOffset : ComplexPattern<i64, 2, "SelectFlatOffset<false>", [], [SDNPWantRoot], -10>;
 def FLATOffsetSigned : ComplexPattern<i64, 2, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>;
+def ScratchOffset : ComplexPattern<i32, 2, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>;
 
 def GlobalSAddr : ComplexPattern<i64, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>;
+def ScratchSAddr : ComplexPattern<i32, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>;
 
 //===----------------------------------------------------------------------===//
 // FLAT classes
@@ -233,6 +235,11 @@
   let maybeAtomic = 1;
 }
 
+class FlatScratchInst <string sv_op, string mode> {
+  string SVOp = sv_op;
+  string Mode = mode;
+}
+
 multiclass FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass, bit HasTiedOutput = 0> {
   let is_flat_scratch = 1 in {
-    def "" : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput>;
-    def _SADDR : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1>;
+    def "" : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput>,
+             FlatScratchInst<opName, "SV">;
+    def _SADDR : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1>,
+                 FlatScratchInst<opName, "SS">;
 
     let SubtargetPredicate = HasFlatScratchSTMode in
-    def _ST : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput>;
+    def _ST : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput>,
+              FlatScratchInst<opName, "ST">;
   }
 }
 
 multiclass FLAT_Scratch_Store_Pseudo <string opName, RegisterClass regClass> {
   let is_flat_scratch = 1 in {
-    def "" : FLAT_Scratch_Store_Pseudo<opName, regClass>;
-    def _SADDR : FLAT_Scratch_Store_Pseudo<opName, regClass, 1>;
+    def "" : FLAT_Scratch_Store_Pseudo<opName, regClass>,
+             FlatScratchInst<opName, "SV">;
+    def _SADDR : FLAT_Scratch_Store_Pseudo<opName, regClass, 1>,
+                 FlatScratchInst<opName, "SS">;
 
     let SubtargetPredicate = HasFlatScratchSTMode in
-    def _ST : FLAT_Scratch_Store_Pseudo<opName, regClass>;
+    def _ST : FLAT_Scratch_Store_Pseudo<opName, regClass>,
+              FlatScratchInst<opName, "ST">;
   }
 }
 
@@ -852,6 +865,37 @@
   (inst $vaddr, $data, $offset)
 >;
 
+class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset))),
+  (inst $vaddr, $offset)
+>;
+
+class ScratchLoadSignedPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset), vt:$in),
+  (inst $vaddr, $offset, 0, 0, 0, $in)
+>;
+
+class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (node vt:$data, (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset)),
+  (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $offset)
+>;
+
+class ScratchLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset))),
+  (inst $saddr, $offset)
+>;
+
+class ScratchLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset), vt:$in)),
+  (inst $saddr, $offset, 0, 0, 0, $in)
+>;
+
+class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (node vt:$data, (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset)),
+  (inst getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
+>;
+
 let OtherPredicates = [HasFlatAddressSpace] in {
 
 def : FlatLoadPat ;
@@ -1009,6 +1053,37 @@
   }
 }
 
+multiclass ScratchFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+  def : ScratchLoadSignedPat <inst, node, vt> {
+    let AddedComplexity = 25;
+  }
+
+  def : ScratchLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+    let AddedComplexity = 26;
+  }
+}
+
+multiclass ScratchFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+  def : ScratchStoreSignedPat <inst, node, vt> {
+    let AddedComplexity = 25;
+  }
+
+  def : ScratchStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+    let AddedComplexity = 26;
+  }
+}
+
+multiclass ScratchFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+  def : ScratchLoadSignedPat_D16 <inst, node, vt> {
+    let AddedComplexity = 25;
+  }
+
+  def : ScratchLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+    let AddedComplexity = 26;
+  }
+}
+
 let OtherPredicates = [HasFlatGlobalInsts] in {
 
 defm : GlobalFLATLoadPats ;
@@ -1109,6 +1184,62 @@
 } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
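+// Each ScratchFLAT* multiclass above instantiates two patterns: the VGPR
+// address form at AddedComplexity = 25 and the _SADDR form at 26, so a
+// uniform (SGPR) address prefers the scratch_*_saddr encoding.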
 
+let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
+
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, extloadi8_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, zextloadi8_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_SBYTE, sextloadi8_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, extloadi16_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, zextloadi16_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_SSHORT, sextloadi16_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, extloadi8_private, i16>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, zextloadi8_private, i16>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_SBYTE, sextloadi8_private, i16>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, load_private, i16>;
+
+foreach vt = Reg32Types.types in {
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORD, load_private, vt>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_DWORD, store_private, vt>;
+}
+
+foreach vt = VReg_64.RegTypes in {
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORDX2, load_private, vt>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX2, store_private, vt>;
+}
+
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORDX3, load_private, v3i32>;
+
+foreach vt = VReg_128.RegTypes in {
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORDX4, load_private, vt>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX4, store_private, vt>;
+}
+
+defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i32>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, truncstorei16_private, i32>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i16>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, store_private, i16>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX3, store_private, v3i32>;
+
+let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, EnableFlatScratch] in {
+defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT_D16_HI, truncstorei16_hi16_private, i32>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE_D16_HI, truncstorei8_hi16_private, i32>;
+
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2f16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_private, v2f16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16_HI, load_d16_hi_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16_HI, load_d16_hi_private, v2f16>;
+
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16, az_extloadi8_d16_lo_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16, az_extloadi8_d16_lo_private, v2f16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16, sextloadi8_d16_lo_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16, sextloadi8_d16_lo_private, v2f16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f16>;
+}
+
+} // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch]
 
 //===----------------------------------------------------------------------===//
 // Target
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -173,7 +173,7 @@
                        int OpNo, const MachineOperand &OpToFold) {
   return OpToFold.isFI() &&
-    (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
+    TII->isMUBUF(UseMI) &&
     OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
 }
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -134,7 +134,8 @@
 // We need to specially emit stack operations here because a different frame
 // register is used than in the rest of the function, as getFrameRegister would
 // use.
-static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
+static void buildPrologSpill(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
+                             MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I,
                              const SIInstrInfo *TII, Register SpillReg,
                              Register ScratchRsrcReg, Register SPReg, int FI) {
@@ -147,7 +148,19 @@
       MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
       MFI.getObjectAlign(FI));
 
-  if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) {
+  if (ST.enableFlatScratch()) {
+    if (TII->isLegalFLATOffset(Offset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
+      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR))
+        .addReg(SpillReg, RegState::Kill)
+        .addReg(SPReg)
+        .addImm(Offset)
+        .addImm(0) // glc
+        .addImm(0) // slc
+        .addImm(0) // dlc
+        .addMemOperand(MMO);
+      return;
+    }
+  } else if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) {
     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
       .addReg(SpillReg, RegState::Kill)
       .addReg(ScratchRsrcReg)
@@ -166,29 +179,48 @@
   // offset in the spill.
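  // (With flat scratch enabled, the fallback below computes the address with
  // an s_add_u32 into a scavenged SGPR and emits scratch_store_dword_saddr;
  // the MUBUF path instead materializes the offset in a VGPR for
  // buffer_store_dword ... offen. See the two branches that follow.)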
LiveRegs.addReg(SpillReg); - MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( - MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass); + if (ST.enableFlatScratch()) { + MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( + MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass); - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg) - .addImm(Offset); + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg) + .addReg(SPReg) + .addImm(Offset); - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN)) - .addReg(SpillReg, RegState::Kill) - .addReg(OffsetReg, RegState::Kill) - .addReg(ScratchRsrcReg) - .addReg(SPReg) - .addImm(0) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addImm(0) // dlc - .addImm(0) // swz - .addMemOperand(MMO); + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR)) + .addReg(SpillReg, RegState::Kill) + .addReg(OffsetReg, RegState::Kill) + .addImm(0) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // dlc + .addMemOperand(MMO); + } else { + MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( + MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg) + .addImm(Offset); + + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN)) + .addReg(SpillReg, RegState::Kill) + .addReg(OffsetReg, RegState::Kill) + .addReg(ScratchRsrcReg) + .addReg(SPReg) + .addImm(0) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addImm(0) // swz + .addMemOperand(MMO); + } LiveRegs.removeReg(SpillReg); } -static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, +static void buildEpilogReload(const GCNSubtarget &ST, LivePhysRegs &LiveRegs, + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const SIInstrInfo *TII, Register SpillReg, Register ScratchRsrcReg, Register SPReg, int FI) { @@ -200,6 +232,35 @@ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4, MFI.getObjectAlign(FI)); + if (ST.enableFlatScratch()) { + if (TII->isLegalFLATOffset(Offset, AMDGPUAS::PRIVATE_ADDRESS, true)) { + BuildMI(MBB, I, DebugLoc(), + TII->get(AMDGPU::SCRATCH_LOAD_DWORD_SADDR), SpillReg) + .addReg(SPReg) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // dlc + .addMemOperand(MMO); + return; + } + MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( + MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass); + + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg) + .addReg(SPReg) + .addImm(Offset); + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_LOAD_DWORD_SADDR), + SpillReg) + .addReg(OffsetReg, RegState::Kill) + .addImm(0) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // dlc + .addMemOperand(MMO); + return; + } + if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) { BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg) @@ -256,6 +317,7 @@ Register FlatScratchInitReg = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); + assert(FlatScratchInitReg); MachineRegisterInfo &MRI = MF.getRegInfo(); MRI.addLiveIn(FlatScratchInitReg); @@ -365,6 +427,10 @@ return ScratchRsrcReg; } +static unsigned getScratchScaleFactor(const GCNSubtarget &ST) { + return ST.enableFlatScratch() ? 
1 : ST.getWavefrontSize(); +} + void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); @@ -461,7 +527,7 @@ Register SPReg = MFI->getStackPtrOffsetReg(); assert(SPReg != AMDGPU::SP_REG); BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg) - .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize()); + .addImm(MF.getFrameInfo().getStackSize() * getScratchScaleFactor(ST)); } if (hasFP(MF)) { @@ -780,7 +846,7 @@ if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); - buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR, + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, Reg.FI.getValue()); @@ -798,7 +864,7 @@ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) .addReg(FramePtrReg); - buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR, + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, FuncInfo->FramePointerSaveIndex.getValue()); } @@ -815,7 +881,7 @@ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) .addReg(BasePtrReg); - buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR, + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, *FuncInfo->BasePointerSaveIndex); } @@ -888,11 +954,11 @@ // s_and_b32 s32, tmp_reg, 0b111...0000 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg) .addReg(StackPtrReg) - .addImm((Alignment - 1) * ST.getWavefrontSize()) + .addImm((Alignment - 1) * getScratchScaleFactor(ST)) .setMIFlag(MachineInstr::FrameSetup); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) .addReg(ScratchSPReg, RegState::Kill) - .addImm(-Alignment * ST.getWavefrontSize()) + .addImm(-Alignment * getScratchScaleFactor(ST)) .setMIFlag(MachineInstr::FrameSetup); FuncInfo->setIsStackRealigned(true); } else if ((HasFP = hasFP(MF))) { @@ -914,7 +980,7 @@ if (HasFP && RoundedSize != 0) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) .addReg(StackPtrReg) - .addImm(RoundedSize * ST.getWavefrontSize()) + .addImm(RoundedSize * getScratchScaleFactor(ST)) .setMIFlag(MachineInstr::FrameSetup); } @@ -976,7 +1042,7 @@ if (RoundedSize != 0 && hasFP(MF)) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) .addReg(StackPtrReg) - .addImm(RoundedSize * ST.getWavefrontSize()) + .addImm(RoundedSize * getScratchScaleFactor(ST)) .setMIFlag(MachineInstr::FrameDestroy); } @@ -1002,7 +1068,7 @@ MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister( MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR, + buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, FI); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg) .addReg(TempVGPR, RegState::Kill); @@ -1028,7 +1094,7 @@ MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister( MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR, + buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg) .addReg(TempVGPR, RegState::Kill); @@ -1053,7 +1119,7 @@ if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); - buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR, + 
buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, Reg.FI.getValue()); } @@ -1264,7 +1330,7 @@ unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; BuildMI(MBB, I, DL, TII->get(Op), SPReg) .addReg(SPReg) - .addImm(Amount * ST.getWavefrontSize()); + .addImm(Amount * getScratchScaleFactor(ST)); } else if (CalleePopAmount != 0) { llvm_unreachable("is this used?"); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2298,7 +2298,8 @@ } assert(!Info->hasDispatchPtr() && - !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && + !Info->hasKernargSegmentPtr() && + (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) && !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -507,11 +507,20 @@ return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT); } + bool isSegmentSpecificFLAT(uint16_t Opcode) const { + auto Flags = get(Opcode).TSFlags; + return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT); + } + // FIXME: Make this more precise static bool isFLATScratch(const MachineInstr &MI) { return isSegmentSpecificFLAT(MI); } + bool isFLATScratch(uint16_t Opcode) const { + return isSegmentSpecificFLAT(Opcode); + } + // Any FLAT encoded instruction, including global_* and scratch_*. bool isFLAT(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FLAT; @@ -1147,6 +1156,9 @@ LLVM_READONLY int getVCMPXNoSDstOp(uint16_t Opcode); + LLVM_READONLY + int getFlatScratchInstSTfromSS(uint16_t Opcode); + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2564,6 +2564,16 @@ let ValueCols = [["1"]]; } +// Maps flat scratch opcodes by addressing modes +def getFlatScratchInstSTfromSS : InstrMapping { + let FilterClass = "FlatScratchInst"; + let RowFields = ["SVOp"]; + let ColFields = ["Mode"]; + let KeyCol = ["SS"]; + let ValueCols = [["ST"]]; +} + + include "SIInstructions.td" include "DSInstructions.td" diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -167,11 +167,12 @@ if (UseFixedABI || F.hasFnAttribute("amdgpu-kernarg-segment-ptr")) KernargSegmentPtr = true; - if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) { + if (ST.hasFlatAddressSpace() && isEntryFunction() && + (isAmdHsaOrMesa || ST.enableFlatScratch())) { // TODO: This could be refined a lot. The attribute is a poor way of // detecting calls or stack objects that may require it before argument // lowering. 
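    // (With enableFlatScratch() the flat scratch init is required
    // unconditionally in entry functions: every private load or store becomes
    // a scratch_* instruction using the flat scratch base rather than a MUBUF
    // resource descriptor.)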
-    if (HasCalls || HasStackObjects)
+    if (HasCalls || HasStackObjects || ST.enableFlatScratch())
       FlatScratchInit = true;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -89,7 +89,7 @@
                          const MachineFunction &MF) const override;
   bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override;
 
-  int64_t getMUBUFInstrOffset(const MachineInstr *MI) const;
+  int64_t getScratchInstrOffset(const MachineInstr *MI) const;
 
   int64_t getFrameIndexInstrOffset(const MachineInstr *MI,
                                    int Idx) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -388,8 +388,8 @@
   return true;
 }
 
-int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
-  assert(SIInstrInfo::isMUBUF(*MI));
+int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
+  assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));
 
   int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::offset);
@@ -398,23 +398,29 @@
 
 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                  int Idx) const {
-  if (!SIInstrInfo::isMUBUF(*MI))
+  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
     return 0;
 
-  assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
-                                           AMDGPU::OpName::vaddr) &&
+  assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                            AMDGPU::OpName::vaddr) ||
+          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                             AMDGPU::OpName::saddr))) &&
          "Should never see frame index on non-address operand");
 
-  return getMUBUFInstrOffset(MI);
+  return getScratchInstrOffset(MI);
 }
 
 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
   if (!MI->mayLoadOrStore())
     return false;
 
-  int64_t FullOffset = Offset + getMUBUFInstrOffset(MI);
+  int64_t FullOffset = Offset + getScratchInstrOffset(MI);
 
-  return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);
+  if (SIInstrInfo::isMUBUF(*MI))
+    return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
 }
 
 void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
@@ -429,9 +435,11 @@
   MachineFunction *MF = MBB->getParent();
   const SIInstrInfo *TII = ST.getInstrInfo();
 
+  unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
+                                           : AMDGPU::V_MOV_B32_e32;
   if (Offset == 0) {
-    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
+    BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
       .addFrameIndex(FrameIdx);
     return;
   }
 
   MachineRegisterInfo &MRI = MF->getRegInfo();
   Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
 
-  Register FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  Register FIReg = MRI.createVirtualRegister(
+      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
+                             : &AMDGPU::VGPR_32RegClass);
 
   BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
     .addImm(Offset);
-  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg)
+  BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
     .addFrameIndex(FrameIdx);
 
+  if (ST.enableFlatScratch()) {
+    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_U32), BaseReg)
+      .addReg(OffsetReg, RegState::Kill)
+      .addReg(FIReg);
+    return;
+  }
+
   TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
     .addReg(OffsetReg, RegState::Kill)
     .addReg(FIReg)
@@ -455,6 +472,7 @@
 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
                                        int64_t Offset) const {
   const SIInstrInfo *TII = ST.getInstrInfo();
+  bool IsFlat = TII->isFLATScratch(MI);
 
 #ifndef NDEBUG
   // FIXME: Is it possible to be storing a frame index to itself?
@@ -469,12 +487,25 @@
   }
 #endif
 
-  MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+  MachineOperand *FIOp =
+      TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
+                                      : AMDGPU::OpName::vaddr);
+
 #ifndef NDEBUG
   MachineBasicBlock *MBB = MI.getParent();
   MachineFunction *MF = MBB->getParent();
+#endif
+
   assert(FIOp && FIOp->isFI() && "frame index must be address operand");
-  assert(TII->isMUBUF(MI));
+  assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
+
+  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
+  int64_t NewOffset = OffsetOp->getImm() + Offset;
+
+  if (IsFlat) {
+    assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true) &&
+           "offset should be legal");
+    FIOp->ChangeToRegister(BaseReg, false);
+    OffsetOp->setImm(NewOffset);
+    return;
+  }
 
+#ifndef NDEBUG
   MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
   assert((SOffset->isReg() &&
           SOffset->getReg() ==
              MF->getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg()) ||
          (SOffset->isImm() && SOffset->getImm() == 0));
 #endif
 
-  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
-  int64_t NewOffset = OffsetOp->getImm() + Offset;
   assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
          "offset should be legal");
@@ -495,12 +524,16 @@
 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                         Register BaseReg,
                                         int64_t Offset) const {
-  if (!SIInstrInfo::isMUBUF(*MI))
+  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
     return false;
 
-  int64_t NewOffset = Offset + getMUBUFInstrOffset(MI);
+  int64_t NewOffset = Offset + getScratchInstrOffset(MI);
 
-  return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);
+  if (SIInstrInfo::isMUBUF(*MI))
+    return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
 }
 
 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
@@ -722,9 +755,10 @@
   const MachineFrameInfo &MFI = MF->getFrameInfo();
   const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
 
-  const MCInstrDesc &Desc = TII->get(LoadStoreOp);
+  const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
   const DebugLoc &DL = MI->getDebugLoc();
-  bool IsStore = Desc.mayStore();
+  bool IsStore = Desc->mayStore();
+  bool IsFlat = TII->isFLATScratch(LoadStoreOp);
 
   bool Scavenged = false;
   MCRegister SOffset = ScratchOffsetReg;
@@ -734,6 +768,7 @@
   unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT);
   unsigned Size = NumSubRegs * EltSize;
   int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
+  int64_t MaxOffset = Offset + Size - EltSize;
   int64_t ScratchOffsetRegDelta = 0;
 
   Align Alignment = MFI.getObjectAlign(Index);
@@ -741,13 +776,17 @@
   assert((Offset % EltSize) == 0 &&
"unexpected VGPR spill offset"); - if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset + Size - EltSize)) { + bool IsOffsetLegal = IsFlat + ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, true) + : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset); + if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) { SOffset = MCRegister(); // We currently only support spilling VGPRs to EltSize boundaries, meaning // we can simplify the adjustment of Offset here to just scale with // WavefrontSize. - Offset *= ST.getWavefrontSize(); + if (!IsFlat) + Offset *= ST.getWavefrontSize(); // We don't have access to the register scavenger if this function is called // during PEI::scavengeFrameVirtualRegs(). @@ -785,8 +824,18 @@ Offset = 0; } + if (IsFlat && SOffset == AMDGPU::NoRegister) { + assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 + && "Unexpected vaddr for flat scratch with a FI operand"); + + assert(ST.hasFlatScratchSTMode()); + LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); + Desc = &TII->get(LoadStoreOp); + } + Register TmpReg; + // FIXME: Flat scratch does not have to be limited to a dword per store. for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) { Register SubReg = NumSubRegs == 1 ? Register(ValueReg) @@ -831,22 +880,26 @@ MF->getMachineMemOperand(PInfo, MMO->getFlags(), EltSize, commonAlignment(Alignment, EltSize * i)); - MIB = BuildMI(*MBB, MI, DL, Desc) + MIB = BuildMI(*MBB, MI, DL, *Desc) .addReg(SubReg, - getDefRegState(!IsStore) | getKillRegState(IsKill)) - .addReg(ScratchRsrcReg); + getDefRegState(!IsStore) | getKillRegState(IsKill)); + if (!IsFlat) + MIB.addReg(ScratchRsrcReg); + if (SOffset == AMDGPU::NoRegister) { - MIB.addImm(0); + if (!IsFlat) + MIB.addImm(0); } else { MIB.addReg(SOffset, SOffsetRegState); } MIB.addImm(Offset) .addImm(0) // glc .addImm(0) // slc - .addImm(0) // tfe - .addImm(0) // dlc - .addImm(0) // swz - .addMemOperand(NewMMO); + .addImm(0); // tfe for MUBUF or dlc for FLAT + if (!IsFlat) + MIB.addImm(0) // dlc + .addImm(0); // swz + MIB.addMemOperand(NewMMO); if (!IsAGPR && NeedSuperRegDef) MIB.addReg(ValueReg, RegState::ImplicitDefine); @@ -947,14 +1000,18 @@ EltSize, Alignment); if (IsLoad) { - buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, + unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR + : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; + buildSpillLoadStore(MI, Opc, Index, VGPR, false, MFI->getScratchRSrcReg(), FrameReg, Offset * EltSize, MMO, RS); } else { - buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, Index, VGPR, + unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR + : AMDGPU::BUFFER_STORE_DWORD_OFFSET; + buildSpillLoadStore(MI, Opc, Index, VGPR, IsKill, MFI->getScratchRSrcReg(), FrameReg, Offset * EltSize, MMO, RS); // This only ever adds one VGPR spill @@ -1294,7 +1351,9 @@ assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == MFI->getStackPtrOffsetReg()); - buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, + unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR + : AMDGPU::BUFFER_STORE_DWORD_OFFSET; + buildSpillLoadStore(MI, Opc, Index, VData->getReg(), VData->isKill(), TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), @@ -1328,7 +1387,9 @@ assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == MFI->getStackPtrOffsetReg()); - buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, + unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR + : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; + buildSpillLoadStore(MI, Opc, Index, VData->getReg(), VData->isKill(), TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), @@ -1342,6 +1403,113 @@ default: { const DebugLoc &DL = MI->getDebugLoc(); + + int64_t Offset = FrameInfo.getObjectOffset(Index); + if (ST.enableFlatScratch()) { + if (TII->isFLATScratch(*MI)) { + // The offset is always swizzled, just replace it + if (FrameReg) + FIOp.ChangeToRegister(FrameReg, false); + + if (!Offset) + return; + + MachineOperand *OffsetOp = + TII->getNamedOperand(*MI, AMDGPU::OpName::offset); + int64_t NewOffset = Offset + OffsetOp->getImm(); + if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, + true)) { + OffsetOp->setImm(NewOffset); + if (FrameReg) + return; + Offset = 0; + } + + assert(!TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr) && + "Unexpected vaddr for flat scratch with a FI operand"); + + // On GFX10 we have ST mode to use no registers for an address. + // Otherwise we need to materialize 0 into an SGPR. + if (!Offset && ST.hasFlatScratchSTMode()) { + unsigned Opc = MI->getOpcode(); + unsigned NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc); + MI->RemoveOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr)); + MI->setDesc(TII->get(NewOpc)); + return; + } + } + + if (!FrameReg) { + FIOp.ChangeToImmediate(Offset); + if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) + return; + } + + // We need to use register here. Check if we can use an SGPR or need + // a VGPR. + FIOp.ChangeToRegister(AMDGPU::M0, false); + bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp); + + if (!Offset && FrameReg && UseSGPR) { + FIOp.setReg(FrameReg); + return; + } + + const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass + : &AMDGPU::VGPR_32RegClass; + + Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR); + FIOp.setReg(TmpReg); + FIOp.setIsKill(true); + + if ((!FrameReg || !Offset) && TmpReg) { + unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; + auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg); + if (FrameReg) + MIB.addReg(FrameReg); + else + MIB.addImm(Offset); + + return; + } + + Register TmpSReg = + UseSGPR ? TmpReg + : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, + !UseSGPR); + + // TODO: for flat scratch another attempt can be made with a VGPR index + // if no SGPRs can be scavenged. + if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) + report_fatal_error("Cannot scavenge register in FI elimination!"); + + if (!TmpSReg) { + // Use frame register and restore it after. + TmpSReg = FrameReg; + FIOp.setReg(FrameReg); + FIOp.setIsKill(false); + } + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), TmpSReg) + .addReg(FrameReg) + .addImm(Offset); + + if (!UseSGPR) + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + .addReg(TmpSReg, RegState::Kill); + + if (TmpSReg == FrameReg) { + // Undo frame register modification. + BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_SUB_U32), + FrameReg) + .addReg(FrameReg) + .addImm(Offset); + } + + return; + } + bool IsMUBUF = TII->isMUBUF(*MI); if (!IsMUBUF && !MFI->isEntryFunction()) { @@ -1471,7 +1639,6 @@ // If the offset is simply too big, don't convert to a scratch wave offset // relative index. 
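  // (For comparison, the flat-scratch path above resolves a large frame
  // offset with a scalar add, e.g. (register numbers illustrative):
  //   s_add_u32 s4, s32, 0x1008
  //   scratch_store_dword off, v0, s4
  // while the MUBUF path below may need a VGPR index register.)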
- int64_t Offset = FrameInfo.getObjectOffset(Index); FIOp.ChangeToImmediate(Offset); if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -1,6 +1,7 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FLATSCR %s declare hidden void @external_void_func_void() #0 @@ -22,7 +23,8 @@ } ; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: -; GCN: buffer_store_dword +; MUBUF: buffer_store_dword +; FLATSCR: scratch_store_dword ; GCN: v_writelane_b32 v40, s33, 4 ; GCN: v_writelane_b32 v40, s34, 0 ; GCN: v_writelane_b32 v40, s35, 1 @@ -39,7 +41,8 @@ ; GCN: v_readlane_b32 s34, v40, 0 ; GCN: v_readlane_b32 s33, v40, 4 -; GCN: buffer_load_dword +; MUBUF: buffer_load_dword +; FLATSCR: scratch_load_dword ; GCN: s_setpc_b64 define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { call void @external_void_func_void() @@ -49,16 +52,19 @@ } ; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: -; GCN: buffer_store_dword v40 +; MUBUF: buffer_store_dword v40 +; FLATSCR: scratch_store_dword off, v40 ; GCN: v_writelane_b32 v40, s33, 4 ; GCN: s_mov_b32 s33, s32 -; GCN: s_add_u32 s32, s32, 0x400 +; MUBUF: s_add_u32 s32, s32, 0x400 +; FLATSCR: s_add_u32 s32, s32, 16 ; GCN: s_swappc_b64 ; GCN-NEXT: s_swappc_b64 ; GCN: v_readlane_b32 s33, v40, 4 -; GCN: buffer_load_dword v40, +; MUBUF: buffer_load_dword v40 +; FLATSCR: scratch_load_dword v40 define void @test_func_call_external_void_funcx2() #0 { call void @external_void_func_void() call void @external_void_func_void() diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MUBUF %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,MUBUF %s +; RUN: llc -march=amdgcn 
-mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FLATSCR %s ; GCN-LABEL: {{^}}callee_no_stack: ; GCN: ; %bb.0: @@ -32,7 +33,8 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}} +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}} +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack() #0 { @@ -48,10 +50,13 @@ ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_add_u32 s32, s32, 0x200 +; MUBUF-NEXT: s_add_u32 s32, s32, 0x200 +; FLATSCR-NEXT: s_add_u32 s32, s32, 8 ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4{{$}} -; GCN-NEXT: s_sub_u32 s32, s32, 0x200 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4{{$}} +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4{{$}} +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 8 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -65,7 +70,8 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}} +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}} +; FLATSCR-NEXT: scratch_store_dword off, v0, s32{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_non_leaf() #2 { @@ -78,26 +84,31 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN: v_writelane_b32 [[CSR_VGPR]], s33, 2 ; GCN-DAG: s_mov_b32 s33, s32 -; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}} +; MUBUF-DAG: s_add_u32 s32, s32, 0x400{{$}} +; FLATSCR-DAG: s_add_u32 s32, s32, 16{{$}} ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} +; MUBUF-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} +; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33{{$}} ; GCN: s_swappc_b64 ; GCN-DAG: v_readlane_b32 s5, [[CSR_VGPR]] ; GCN-DAG: v_readlane_b32 s4, [[CSR_VGPR]] -; GCN: s_sub_u32 s32, s32, 0x400{{$}} +; MUBUF: s_sub_u32 s32, s32, 0x400{{$}} +; FLATSCR: s_sub_u32 s32, s32, 16{{$}} ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -118,9 +129,11 @@ ; GCN-LABEL: {{^}}callee_no_stack_with_call: ; GCN: s_waitcnt ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword 
[[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-DAG: s_add_u32 s32, s32, 0x400 +; MUBUF-DAG: s_add_u32 s32, s32, 0x400 +; FLATSCR-DAG: s_add_u32 s32, s32, 16 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s33, [[FP_SPILL_LANE:[0-9]+]] ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0 @@ -130,10 +143,12 @@ ; GCN-DAG: v_readlane_b32 s4, v40, 0 ; GCN-DAG: v_readlane_b32 s5, v40, 1 -; GCN: s_sub_u32 s32, s32, 0x400 +; MUBUF: s_sub_u32 s32, s32, 0x400 +; FLATSCR: s_sub_u32 s32, s32, 16 ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], [[FP_SPILL_LANE]] ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -149,7 +164,8 @@ ; ; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls: ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN: v_writelane_b32 [[CSR_VGPR]], s ; GCN: v_writelane_b32 [[CSR_VGPR]], s @@ -159,7 +175,8 @@ ; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]] ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -208,16 +225,21 @@ ; GCN-NEXT:s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:8 +; MUBUF-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:8 +; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33 offset:8 ; GCN: ;;#ASMSTART ; GCN-NEXT: ; clobber v41 ; GCN-NEXT: ;;#ASMEND -; GCN: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN: s_add_u32 s32, s32, 0x300 -; GCN-NEXT: s_sub_u32 s32, s32, 0x300 +; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload +; MUBUF: s_add_u32 s32, s32, 0x300 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300 +; FLATSCR: s_add_u32 s32, s32, 12 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 12 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -233,14 +255,18 @@ ; GCN: s_waitcnt ; GCN-NEXT: v_writelane_b32 v1, s33, 63 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; 
FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill ; GCN-COUNT-63: v_writelane_b32 v1 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8 +; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8 +; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:8 ; GCN: ;;#ASMSTART ; GCN-COUNT-63: v_readlane_b32 s{{[0-9]+}}, v1 -; GCN: s_add_u32 s32, s32, 0x300 -; GCN-NEXT: s_sub_u32 s32, s32, 0x300 +; MUBUF: s_add_u32 s32, s32, 0x300 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300 +; FLATSCR: s_add_u32 s32, s32, 12 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 12 ; GCN-NEXT: v_readlane_b32 s33, v1, 63 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -265,16 +291,21 @@ ; GCN: s_waitcnt ; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill ; GCN-COUNT-64: v_writelane_b32 v1, -; GCN: buffer_store_dword +; MUBUF: buffer_store_dword +; FLATSCR: scratch_store_dword ; GCN: ;;#ASMSTART ; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1 -; GCN: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN: s_add_u32 s32, s32, 0x300 -; GCN-NEXT: s_sub_u32 s32, s32, 0x300 +; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload +; MUBUF: s_add_u32 s32, s32, 0x300 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300 +; FLATSCR: s_add_u32 s32, s32, 12 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 12 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -296,13 +327,18 @@ ; GCN-LABEL: {{^}}realign_stack_no_fp_elim: ; GCN: s_waitcnt -; GCN-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0 -; GCN-NEXT: s_mov_b32 s4, s33 -; GCN-NEXT: s_and_b32 s33, [[SCRATCH]], 0xfff80000 -; GCN-NEXT: s_add_u32 s32, s32, 0x100000 -; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN-NEXT: buffer_store_dword [[ZERO]], off, s[0:3], s33 -; GCN-NEXT: s_sub_u32 s32, s32, 0x100000 +; MUBUF-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0 +; FLATSCR-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x1fff +; GCN-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_and_b32 s33, [[SCRATCH]], 0xfff80000 +; FLATSCR-NEXT: s_and_b32 s33, [[SCRATCH]], 0xffffe000 +; MUBUF-NEXT: s_add_u32 s32, s32, 0x100000 +; FLATSCR-NEXT: s_add_u32 s32, s32, 0x4000 +; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; MUBUF-NEXT: buffer_store_dword [[ZERO]], off, s[0:3], s33 +; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s33 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x100000 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x4000 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -319,12 +355,15 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; GCN: v_writelane_b32 v1, s31, 1 -; GCN: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 +; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 +; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4 ; GCN: ;;#ASMSTART ; GCN: v_readlane_b32 s4, v1, 0 -; GCN-NEXT: s_add_u32 s32, s32, 0x200 -; GCN-NEXT: v_readlane_b32 s5, v1, 1 -; GCN-NEXT: s_sub_u32 s32, s32, 0x200 +; MUBUF-NEXT: s_add_u32 s32, s32, 0x200 +; FLATSCR-NEXT: s_add_u32 s32, s32, 8 +; GCN-NEXT: v_readlane_b32 s5, v1, 1 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 8 ; GCN-NEXT: v_readlane_b32 
s33, v1, 2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -346,24 +385,29 @@ ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr: ; GCN: s_waitcnt ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 -; GCN-DAG: buffer_store_dword -; GCN: s_add_u32 s32, s32, 0x300{{$}} +; MUBUF-DAG: buffer_store_dword +; FLATSCR-DAG: scratch_store_dword +; MUBUF: s_add_u32 s32, s32, 0x300{{$}} +; FLATSCR: s_add_u32 s32, s32, 12{{$}} ; GCN: ;;#ASMSTART -; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0 -; GCN-NEXT: v_readlane_b32 s5, [[CSR_VGPR]], 1 -; GCN-NEXT: s_sub_u32 s32, s32, 0x300{{$}} +; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0 +; GCN-NEXT: v_readlane_b32 s5, [[CSR_VGPR]], 1 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300{{$}} +; FLATSCR-NEXT: s_sub_u32 s32, s32, 12{{$}} ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -392,25 +436,32 @@ ; GCN-LABEL: {{^}}scratch_reg_needed_mubuf_offset: ; GCN: s_waitcnt ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 -; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill +; MUBUF-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill +; FLATSCR-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1008 +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], [[SCRATCH_SGPR]] ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 -; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 -; GCN-DAG: s_add_u32 s32, s32, 0x40300{{$}} -; GCN-DAG: buffer_store_dword +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0 +; GCN-DAG: s_mov_b32 s33, s32 +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 +; MUBUF-DAG: s_add_u32 s32, s32, 0x40300{{$}} +; FLATSCR-DAG: s_add_u32 s32, s32, 0x100c{{$}} +; MUBUF-DAG: buffer_store_dword +; FLATSCR-DAG: scratch_store_dword ; GCN: ;;#ASMSTART ; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0 ; GCN-NEXT: v_readlane_b32 s5, [[CSR_VGPR]], 1 -; GCN-NEXT: s_sub_u32 s32, s32, 0x40300{{$}} +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x40300{{$}} +; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x100c{{$}} ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 -; GCN-NEXT: 
buffer_load_dword [[CSR_VGPR]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Reload +; MUBUF-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Reload +; FLATSCR-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1008 +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, [[SCRATCH_SGPR]] ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -447,10 +498,13 @@ ; GCN-LABEL: {{^}}ipra_call_with_stack: ; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN: s_mov_b32 s33, s32 -; GCN: s_add_u32 s32, s32, 0x400 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33{{$}} -; GCN: s_swappc_b64 -; GCN: s_sub_u32 s32, s32, 0x400 +; MUBUF: s_add_u32 s32, s32, 0x400 +; FLATSCR: s_add_u32 s32, s32, 16 +; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33{{$}} +; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33{{$}} +; GCN: s_swappc_b64 +; MUBUF: s_sub_u32 s32, s32, 0x400 +; FLATSCR: s_sub_u32 s32, s32, 16 ; GCN: s_mov_b32 s33, [[FP_COPY:s[0-9]+]] define void @ipra_call_with_stack() #0 { %alloca = alloca i32, addrspace(5) @@ -463,11 +517,13 @@ ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory: ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33 -; GCN: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4 +; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4 +; FLATSCR: scratch_store_dword off, [[TMP_VGPR1]], s32 offset:4 ; GCN: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN: s_mov_b32 s33, s32 ; GCN: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:4 +; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:4 +; FLATSCR: scratch_load_dword [[TMP_VGPR2:v[0-9]+]], off, s32 offset:4 ; GCN: s_waitcnt vmcnt(0) ; GCN: v_readfirstlane_b32 s33, [[TMP_VGPR2]] ; GCN: s_mov_b64 exec, [[COPY_EXEC2]] @@ -494,13 +550,15 @@ ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr: ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33 -; GCN: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:[[OFF:[0-9]+]] +; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:[[OFF:[0-9]+]] +; FLATSCR: scratch_store_dword off, [[TMP_VGPR1]], s32 offset:[[OFF:[0-9]+]] ; GCN: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NOT: v_writelane_b32 v40, s33 ; GCN: s_mov_b32 s33, s32 ; GCN-NOT: v_readlane_b32 s33, v40 ; GCN: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:[[OFF]] +; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:[[OFF]] +; FLATSCR: scratch_load_dword [[TMP_VGPR2:v[0-9]+]], off, s32 offset:[[OFF]] ; GCN: v_readfirstlane_b32 s33, [[TMP_VGPR2]] ; GCN: s_mov_b64 exec, [[COPY_EXEC2]] ; GCN: s_setpc_b64 @@ -529,10 +587,13 @@ ; scratch VGPR to hold the offset. 
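A relationship worth making explicit, since it governs every paired MUBUF/FLATSCR immediate in the stack adjustments above: the MUBUF form keeps the stack pointer in swizzled units (per-lane bytes scaled by the wavefront size, 64 on these wave64 targets), while the flat-scratch form keeps it in plain per-lane bytes, so each MUBUF constant is exactly 64x its FLATSCR counterpart. Below is a minimal sketch checking that rule against the constants in these tests; the pairs are read off the checks above, and the 64x scaling itself is an inference from those values rather than something the patch states.

wave_size = 64  # wave64 targets used by these run lines
# (MUBUF immediate, FLATSCR immediate) pairs taken from the checks above
pairs = [
    (0x300, 12),         # s_add_u32/s_sub_u32 s32 around a 12-byte frame
    (0x200, 8),          # 8-byte frame variant
    (0x400, 16),         # ipra_call_with_stack
    (0x40300, 0x100c),   # scratch_reg_needed_mubuf_offset
    (0x100000, 0x4000),  # realign_stack_no_fp_elim
]
for mubuf_imm, flatscr_imm in pairs:
    assert mubuf_imm == flatscr_imm * wave_size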
; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset ; GCN: s_or_saveexec_b64 s[4:5], -1 -; GCN: v_mov_b32_e32 v0, s33 +; MUBUF: v_mov_b32_e32 v0, s33 ; GCN-NOT: v_mov_b32_e32 v0, 0x1008 -; GCN-NEXT: v_mov_b32_e32 v1, 0x1008 -; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1008 +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill +; FLATSCR-NEXT: s_add_u32 [[SOFF:s[0-9]+]], s32, 0x1008 +; FLATSCR-NEXT: v_mov_b32_e32 v0, s33 +; FLATSCR-NEXT: scratch_store_dword off, v0, [[SOFF]] ; 4-byte Folded Spill define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval align 4 %arg) #3 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -1,15 +1,27 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s define <2 x half> @chain_hi_to_lo_private() { -; GCN-LABEL: chain_hi_to_lo_private: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_private: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: chain_hi_to_lo_private: +; FLATSCR: ; %bb.0: ; %bb +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s4, 2 +; FLATSCR-NEXT: scratch_load_ushort v0, off, s4 +; FLATSCR-NEXT: s_mov_b32 s4, 0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s4 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1 %load_lo = load half, half addrspace(5)* %gep_lo @@ -23,14 +35,23 @@ } define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %base_lo, half addrspace(5)* %base_hi) { -; GCN-LABEL: chain_hi_to_lo_private_different_bases: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_private_different_bases: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: chain_hi_to_lo_private_different_bases: +; FLATSCR: ; %bb.0: 
; %bb +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: scratch_load_ushort v0, v0, off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_load_short_d16_hi v0, v1, off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, half addrspace(5)* %base_lo %load_hi = load half, half addrspace(5)* %base_hi @@ -42,14 +63,23 @@ } define <2 x half> @chain_hi_to_lo_arithmatic(half addrspace(5)* %base, half %in) { -; GCN-LABEL: chain_hi_to_lo_arithmatic: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_arithmatic: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX900-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: chain_hi_to_lo_arithmatic: +; FLATSCR: ; %bb.0: ; %bb +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: v_add_f16_e32 v1, 1.0, v1 +; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, v1 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] bb: %arith_lo = fadd half %in, 1.0 %load_hi = load half, half addrspace(5)* %base @@ -191,38 +221,75 @@ ; Make sure we don't lose any of the private stores. define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %in, <2 x i16> addrspace(1)* nocapture %out) #0 { -; GCN-LABEL: vload2_private: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: global_load_ushort v2, v[0:1], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 -; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:6 -; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:4 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:8 -; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 -; GCN-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, v4 -; GCN-NEXT: buffer_load_short_d16_hi v3, off, s[0:3], 0 offset:8 -; GCN-NEXT: v_lshl_or_b32 v2, v4, 16, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GCN-NEXT: s_endpgm +; GFX900-LABEL: vload2_private: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX900-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX900-NEXT: s_add_u32 s0, s0, s9 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v1, s5 +; GFX900-NEXT: 
global_load_ushort v2, v[0:1], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 +; GFX900-NEXT: global_load_ushort v2, v[0:1], off offset:2 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:6 +; GFX900-NEXT: global_load_ushort v2, v[0:1], off offset:4 +; GFX900-NEXT: v_mov_b32_e32 v0, s6 +; GFX900-NEXT: v_mov_b32_e32 v1, s7 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:8 +; GFX900-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 +; GFX900-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6 +; GFX900-NEXT: s_waitcnt vmcnt(1) +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: buffer_load_short_d16_hi v3, off, s[0:3], 0 offset:8 +; GFX900-NEXT: v_lshl_or_b32 v2, v4, 16, v2 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX900-NEXT: s_endpgm +; +; FLATSCR-LABEL: vload2_private: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, s4 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s5 +; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:4 +; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:2 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:6 +; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:4 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, s6 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s7 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:8 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: scratch_load_ushort v2, off, vcc_hi offset:4 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: scratch_load_ushort v4, off, vcc_hi offset:6 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: s_waitcnt vmcnt(1) +; FLATSCR-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v3, v4 +; FLATSCR-NEXT: scratch_load_short_d16_hi v3, off, vcc_hi offset:8 +; FLATSCR-NEXT: v_lshl_or_b32 v2, v4, 16, v2 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; FLATSCR-NEXT: s_endpgm entry: %loc = alloca [3 x i16], align 2, addrspace(5) %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)* @@ -297,16 +364,27 @@ } define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) { -; GCN-LABEL: chain_hi_to_lo_private_other_dep: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] -; GCN-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_private_other_dep: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
+; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: chain_hi_to_lo_private_other_dep:
+; FLATSCR: ; %bb.0: ; %bb
+; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
+; FLATSCR-NEXT: scratch_load_short_d16 v1, v0, off offset:2
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: v_mov_b32_e32 v0, v1
+; FLATSCR-NEXT: s_setpc_b64 s[30:31]
 bb:
 %gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1
 %load_lo = load i16, i16 addrspace(5)* %gep_lo
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=-unaligned-scratch-access < %s | FileCheck -check-prefixes=GCN,GFX7-ALIGNED %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=+unaligned-scratch-access < %s | FileCheck -check-prefixes=GCN,GFX7-UNALIGNED %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-scratch-access < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-scratch-access -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,GFX9-FLATSCR %s
 ; Should not merge this to a dword load
 define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 {
@@ -35,6 +36,15 @@
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: private_load_2xi16_align2:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT: scratch_load_ushort v1, v0, off
+; GFX9-FLATSCR-NEXT: scratch_load_ushort v0, v0, off offset:2
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31]
 %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
 %p.0 = load i16, i16 addrspace(5)* %p, align 2
 %p.1 = load i16, i16 addrspace(5)* %gep.p, align 2
@@ -78,6 +88,16 @@
 ; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen offset:2
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: private_store_2xi16_align2:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 1
+; GFX9-FLATSCR-NEXT: scratch_store_short v1, v0, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 2
+; GFX9-FLATSCR-NEXT: scratch_store_short v1, v0, off offset:2
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31]
 %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
 store i16 1, i16 addrspace(5)* %r, align 2
 store i16 2, i16 addrspace(5)* %gep.r, align 2
@@ -124,6 +144,17 @@
 ; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0
 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: private_load_2xi16_align1:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_bfi_b32 v1, v1, 0, v0
+; GFX9-FLATSCR-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31]
 %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
 %p.0 = load i16, i16 addrspace(5)* %p, align 1
 %p.1 = load i16, i16 addrspace(5)* %gep.p, align 1
@@ -167,6 +198,14 @@
 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: private_store_2xi16_align1:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x20001
+; GFX9-FLATSCR-NEXT: scratch_store_dword v1, v0, off
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31]
 %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
 store i16 1, i16 addrspace(5)* %r, align 1
 store i16 2, i16 addrspace(5)* %gep.r, align 1
@@ -206,6 +245,17 @@
 ; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0
 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: private_load_2xi16_align4:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_bfi_b32 v1, v1, 0, v0
+; GFX9-FLATSCR-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31]
 %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
 %p.0 = load i16, i16 addrspace(5)* %p, align 4
 %p.1 = load i16, i16 addrspace(5)* %gep.p, align 2
@@ -228,13 +278,37 @@
 ; GFX7-NEXT: flat_store_dword v[0:1], v2
 ; GFX7-NEXT: s_endpgm
 ;
-; GCN-LABEL: private_store_2xi16_align4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, 0x20001
-; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX7-ALIGNED-LABEL: private_store_2xi16_align4:
+; GFX7-ALIGNED: ; %bb.0:
+; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x20001
+; GFX7-ALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-UNALIGNED-LABEL: private_store_2xi16_align4:
+; GFX7-UNALIGNED: ; %bb.0:
+; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x20001
+; GFX7-UNALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: private_store_2xi16_align4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001
+; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: private_store_2xi16_align4:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x20001
+; GFX9-FLATSCR-NEXT: scratch_store_dword v1, v0, off
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31]
 %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
 store i16 1, i16 addrspace(5)* %r,
align 4 store i16 2, i16 addrspace(5)* %gep.r, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -0,0 +1,1295 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s + +define amdgpu_kernel void @zero_init_kernel() { +; GFX9-LABEL: zero_init_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:76 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:72 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:68 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:64 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:60 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:56 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:52 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:48 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:44 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:40 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:36 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:32 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:28 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:24 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:20 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:16 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: zero_init_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword off, v0, off offset:76 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:72 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:68 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:64 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:60 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:56 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:52 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:48 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:44 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:40 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:36 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:32 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:28 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:24 +; GFX10-NEXT: 
scratch_store_dword off, v0, off offset:20 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:16 +; GFX10-NEXT: s_endpgm + %alloca = alloca [32 x i16], align 2, addrspace(5) + %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + ret void +} + +define void @zero_init_foo() { +; GFX9-LABEL: zero_init_foo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:60 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:56 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:52 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:48 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:44 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:40 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:36 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:32 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:28 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:24 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:20 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:16 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:12 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:8 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 +; GFX9-NEXT: scratch_store_dword off, v0, s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: zero_init_foo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:60 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:56 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:52 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:48 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:44 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:40 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:36 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:32 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:28 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:24 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:20 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:16 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:12 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:8 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 +; GFX10-NEXT: scratch_store_dword off, v0, s32 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %alloca = alloca [32 x i16], align 2, addrspace(5) + %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + ret void +} + +define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { +; GFX9-LABEL: store_load_sindex_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-NEXT: s_and_b32 s0, s0, 15 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_add_u32 s1, 4, s1 +; GFX9-NEXT: scratch_store_dword off, v0, s1 +; GFX9-NEXT: s_add_u32 s0, 4, s0 +; GFX9-NEXT: 
scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_sindex_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_and_b32 s1, s0, 15 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 2 +; GFX10-NEXT: s_add_u32 s0, 4, s0 +; GFX10-NEXT: s_add_u32 s1, 4, s1 +; GFX10-NEXT: scratch_store_dword off, v0, s0 +; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: s_endpgm +bb: + %i = alloca [32 x float], align 4, addrspace(5) + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { +; GFX9-LABEL: store_load_sindex_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_lshl_b32 s0, s2, 2 +; GFX9-NEXT: s_add_u32 s0, 4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: scratch_store_dword off, v0, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 15 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_add_u32 s0, 4, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_sindex_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: s_and_b32 s0, s2, 15 +; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_lshl_b32 s1, s2, 2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: s_add_u32 s1, 4, s1 +; GFX10-NEXT: s_add_u32 s0, 4, s0 +; GFX10-NEXT: scratch_store_dword off, v0, s1 +; GFX10-NEXT: scratch_load_dword v0, off, s0 +; GFX10-NEXT: s_endpgm +bb: + %i = alloca [32 x float], align 4, addrspace(5) + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_kernel void @store_load_vindex_kernel() { +; GFX9-LABEL: store_load_vindex_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-NEXT: 
s_endpgm +; +; GFX10-LABEL: store_load_vindex_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: v_mov_b32_e32 v1, 4 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX10-NEXT: scratch_store_dword v2, v3, off +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX10-NEXT: s_endpgm +bb: + %i = alloca [32 x float], align 4, addrspace(5) + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %i3 = zext i32 %i2 to i64 + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = sub nsw i32 31, %i2 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define void @store_load_vindex_foo(i32 %idx) { +; GFX9-LABEL: store_load_vindex_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s32 +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 +; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_vindex_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: v_mov_b32_e32 v2, s32 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 +; GFX10-NEXT: scratch_store_dword v0, v1, off +; GFX10-NEXT: scratch_load_dword v0, v2, off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %i = alloca [32 x float], align 4, addrspace(5) + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define void @private_ptr_foo(float addrspace(5)* nocapture %arg) { +; GFX9-LABEL: private_ptr_foo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: private_ptr_foo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; 
GFX10-NEXT: scratch_store_dword v0, v1, off offset:4 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1 + store float 1.000000e+01, float addrspace(5)* %gep, align 4 + ret void +} + +define amdgpu_kernel void @zero_init_small_offset_kernel() { +; GFX9-LABEL: zero_init_small_offset_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:284 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:280 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:276 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:272 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:300 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:296 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:292 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:288 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:316 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:312 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:308 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:304 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:332 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:328 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:324 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:320 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: zero_init_small_offset_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword off, v0, off offset:284 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:280 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:276 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:272 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:300 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:296 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:292 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:288 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:316 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:312 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:308 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:304 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:332 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:328 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:324 +; GFX10-NEXT: scratch_store_dword off, v0, 
off offset:320 +; GFX10-NEXT: s_endpgm + %padding = alloca [64 x i32], align 4, addrspace(5) + %alloca = alloca [32 x i16], align 2, addrspace(5) + %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + ret void +} + +define void @zero_init_small_offset_foo() { +; GFX9-LABEL: zero_init_small_offset_foo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: scratch_load_dword v0, off, s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:268 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:264 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:260 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:256 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:284 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:280 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:276 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:272 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:300 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:296 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:292 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:288 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:316 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:312 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:308 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:304 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: zero_init_small_offset_foo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s32 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:268 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:264 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:260 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:256 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:284 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:280 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:276 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:272 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:300 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:296 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:292 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:288 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:316 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:312 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:308 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:304 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %padding = alloca [64 x i32], align 4, addrspace(5) + %alloca = alloca [32 x i16], align 2, addrspace(5) + %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 
0, i64 64, i1 false) + ret void +} + +define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { +; GFX9-LABEL: store_load_sindex_small_offset_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-NEXT: s_and_b32 s0, s0, 15 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: s_add_u32 s1, 0x104, s1 +; GFX9-NEXT: scratch_store_dword off, v0, s1 +; GFX9-NEXT: s_add_u32 s0, 0x104, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_sindex_small_offset_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_and_b32 s1, s0, 15 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 2 +; GFX10-NEXT: s_add_u32 s0, 0x104, s0 +; GFX10-NEXT: s_add_u32 s1, 0x104, s1 +; GFX10-NEXT: scratch_store_dword off, v0, s0 +; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: s_endpgm +bb: + %padding = alloca [64 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { +; GFX9-LABEL: store_load_sindex_small_offset_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: s_lshl_b32 s0, s2, 2 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_add_u32 s0, 0x104, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: scratch_store_dword off, v0, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 15 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_add_u32 s0, 0x104, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_sindex_small_offset_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-NEXT: s_and_b32 s0, s2, 15 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; 
GFX10-NEXT: s_lshl_b32 s1, s2, 2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: s_add_u32 s1, 0x104, s1 +; GFX10-NEXT: s_add_u32 s0, 0x104, s0 +; GFX10-NEXT: scratch_store_dword off, v0, s1 +; GFX10-NEXT: scratch_load_dword v0, off, s0 +; GFX10-NEXT: s_endpgm +bb: + %padding = alloca [64 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { +; GFX9-LABEL: store_load_vindex_small_offset_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_vindex_small_offset_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 +; GFX10-NEXT: scratch_store_dword v2, v3, off +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX10-NEXT: s_endpgm +bb: + %padding = alloca [64 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %i3 = zext i32 %i2 to i64 + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = sub nsw i32 31, %i2 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define void @store_load_vindex_small_offset_foo(i32 %idx) { +; GFX9-LABEL: store_load_vindex_small_offset_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: scratch_load_dword v1, off, s32 +; 
GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x100 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 +; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_vindex_small_offset_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x100 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 +; GFX10-NEXT: scratch_load_dword v3, off, s32 +; GFX10-NEXT: scratch_store_dword v0, v1, off +; GFX10-NEXT: scratch_load_dword v0, v2, off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %padding = alloca [64 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_kernel void @zero_init_large_offset_kernel() { +; GFX9-LABEL: zero_init_large_offset_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:12 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:8 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:28 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:24 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:20 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:16 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:44 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:40 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:36 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:32 +; GFX9-NEXT: s_movk_i32 
vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:60 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:56 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:52 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:48 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: zero_init_large_offset_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:12 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:8 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:4 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:28 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:24 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:20 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:16 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:44 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:40 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:36 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:32 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:60 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:56 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:52 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:48 +; GFX10-NEXT: s_endpgm + %padding = alloca [4096 x i32], align 4, addrspace(5) + %alloca = alloca [32 x i16], align 2, addrspace(5) + %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + ret void +} + +define void @zero_init_large_offset_foo() { +; GFX9-LABEL: zero_init_large_offset_foo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: scratch_load_dword v0, off, s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:12 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:8 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 +; GFX9-NEXT: s_add_u32 
vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:28 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:24 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:20 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:16 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:44 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:40 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:36 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:32 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:60 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:56 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:52 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:48 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: zero_init_large_offset_foo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s32 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:12 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:8 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:4 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:28 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:24 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:20 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:16 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:44 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:40 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:36 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:32 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:60 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:56 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:52 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:48 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %padding = alloca [4096 x i32], align 4, addrspace(5) + %alloca = alloca [32 x i16], 
align 2, addrspace(5) + %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + ret void +} + +define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { +; GFX9-LABEL: store_load_sindex_large_offset_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-NEXT: s_and_b32 s0, s0, 15 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: s_add_u32 s1, 0x4004, s1 +; GFX9-NEXT: scratch_store_dword off, v0, s1 +; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_sindex_large_offset_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_and_b32 s1, s0, 15 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 2 +; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX10-NEXT: s_add_u32 s1, 0x4004, s1 +; GFX10-NEXT: scratch_store_dword off, v0, s0 +; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: s_endpgm +bb: + %padding = alloca [4096 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { +; GFX9-LABEL: store_load_sindex_large_offset_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: s_lshl_b32 s0, s2, 2 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: scratch_store_dword off, v0, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 15 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_sindex_large_offset_foo: +; GFX10: ; 
%bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-NEXT: s_and_b32 s0, s2, 15 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_lshl_b32 s1, s2, 2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: s_add_u32 s1, 0x4004, s1 +; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX10-NEXT: scratch_store_dword off, v0, s1 +; GFX10-NEXT: scratch_load_dword v0, off, s0 +; GFX10-NEXT: s_endpgm +bb: + %padding = alloca [4096 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { +; GFX9-LABEL: store_load_vindex_large_offset_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_vindex_large_offset_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 +; GFX10-NEXT: scratch_store_dword v2, v3, off +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX10-NEXT: s_endpgm +bb: + %padding = alloca [4096 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %i3 = zext i32 %i2 to i64 + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = sub nsw i32 31, %i2 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + 
%i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define void @store_load_vindex_large_offset_foo(i32 %idx) { +; GFX9-LABEL: store_load_vindex_large_offset_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: scratch_load_dword v1, off, s32 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 +; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_vindex_large_offset_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 +; GFX10-NEXT: scratch_load_dword v3, off, s32 +; GFX10-NEXT: scratch_store_dword v0, v1, off +; GFX10-NEXT: scratch_load_dword v0, v2, off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %padding = alloca [4096 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_kernel void @store_load_large_imm_offset_kernel() { +; GFX9-LABEL: store_load_large_imm_offset_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_movk_i32 s0, 0x3000 +; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 +; GFX9-NEXT: s_add_u32 s0, 4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 +; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_large_imm_offset_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: v_mov_b32_e32 v0, 13 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: s_movk_i32 s0, 0x3800 +; GFX10-NEXT: s_add_u32 s0, 4, s0 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 +; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 +; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 +; GFX10-NEXT: 
s_endpgm +bb: + %i = alloca [4096 x i32], align 4, addrspace(5) + %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef + store volatile i32 13, i32 addrspace(5)* %i1, align 4 + %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 + store volatile i32 15, i32 addrspace(5)* %i7, align 4 + %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 + %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 + ret void +} + +define void @store_load_large_imm_offset_foo() { +; GFX9-LABEL: store_load_large_imm_offset_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s4, 0x3000 +; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: scratch_store_dword off, v0, s32 +; GFX9-NEXT: s_add_u32 s4, s32, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: scratch_store_dword off, v0, s4 offset:3712 +; GFX9-NEXT: scratch_load_dword v0, off, s4 offset:3712 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_large_imm_offset_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 13 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: s_movk_i32 s4, 0x3800 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_add_u32 s4, s32, s4 +; GFX10-NEXT: scratch_store_dword off, v0, s32 +; GFX10-NEXT: scratch_store_dword off, v1, s4 offset:1664 +; GFX10-NEXT: scratch_load_dword v0, off, s4 offset:1664 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %i = alloca [4096 x i32], align 4, addrspace(5) + %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef + store volatile i32 13, i32 addrspace(5)* %i1, align 4 + %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 + store volatile i32 15, i32 addrspace(5)* %i7, align 4 + %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 + %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 + ret void +} + +define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { +; GFX9-LABEL: store_load_vidx_sidx_offset: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_vidx_sidx_offset: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 4 +; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024 +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 +; GFX10-NEXT: s_endpgm +bb: + %alloca = alloca [32 x i32], align 4, addrspace(5) + %vidx = tail call i32 @llvm.amdgcn.workitem.id.x() + 
%add1 = add nsw i32 %sidx, %vidx + %add2 = add nsw i32 %add1, 256 + %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2 + store volatile i32 15, i32 addrspace(5)* %gep, align 4 + %load = load volatile i32, i32 addrspace(5)* %gep, align 4 + ret void +} + +; FIXME: Multi-DWORD scratch should be supported +define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) { +; GFX9-LABEL: store_load_i64_aligned: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_dword v0, v1, off +; GFX9-NEXT: scratch_load_dword v1, v0, off offset:4 +; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_i64_aligned: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4 +; GFX10-NEXT: scratch_store_dword v0, v2, off +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: scratch_load_dword v1, v0, off offset:4 +; GFX10-NEXT: scratch_load_dword v0, v0, off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + store volatile i64 15, i64 addrspace(5)* %arg, align 8 + %load = load volatile i64, i64 addrspace(5)* %arg, align 8 + ret void +} + +; FIXME: Multi-DWORD unaligned scratch should be supported +define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) { +; GFX9-LABEL: store_load_i64_unaligned: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:7 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:6 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:5 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:4 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:3 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:2 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:1 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_byte v0, v1, off +; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v1, v0, off +; GFX9-NEXT: scratch_load_ubyte v0, v0, off offset:1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_i64_unaligned: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:7 +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:6 +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:5 +; GFX10-NEXT: scratch_store_byte v0, v1, off 
offset:4 +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:3 +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:2 +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:1 +; GFX10-NEXT: scratch_store_byte v0, v2, off +; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:7 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: scratch_load_ubyte v1, v0, off +; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + store volatile i64 15, i64 addrspace(5)* %arg, align 1 + %load = load volatile i64, i64 addrspace(5)* %arg, align 1 + ret void +} + +declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg) +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -1,5 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-MUBUF,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-sroa=0 -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s ; Test that non-entry function frame indices are expanded properly to ; give an index relative to the scratch wave offset register @@ -9,9 +10,13 @@ ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshr_b32_e64 v0, s32, 6 -; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32 + +; GFX9-FLATSCR: v_mov_b32_e32 v0, s32 +; GFX9-FLATSCR-NOT: v_lshrrev_b32_e64 + +; MUBUF-NOT: v_mov -; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 define void @func_mov_fi_i32() #0 { %alloca = alloca i32, addrspace(5) @@ -30,11 +35,14 @@ ; CI-NEXT: v_add_i32_e{{32|64}} v0, {{s\[[0-9]+:[0-9]+\]|vcc}}, 4, [[SCALED]] ; CI-NEXT: ds_write_b32 v0, v0 -; GFX9: v_lshrrev_b32_e64 v0, 6, s32 -; GFX9-NEXT: ds_write_b32 v0, v0 -; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 -; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] -; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-FLATSCR: v_mov_b32_e32 v0, s32 +; GFX9-FLATSCR: s_add_u32 [[ADD:[^,]+]], s32, 4 +; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 
[[SCALED:v[0-9]+]], 6, s32 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, [[ADD]] +; GFX9-NEXT: ds_write_b32 v0, v0 define void @func_mov_fi_i32_offset() #0 { %alloca0 = alloca i32, addrspace(5) %alloca1 = alloca i32, addrspace(5) @@ -52,8 +60,11 @@ ; CI: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]] -; GFX9: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 -; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] +; GFX9-MUBUF: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] + +; GFX9-FLATSCR: v_mov_b32_e32 [[ADD:v[0-9]+]], s32 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v0, 4, [[ADD]] ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -71,7 +82,8 @@ ; CI: v_lshr_b32_e64 v0, s32, 6 -; GFX9: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-MUBUF: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-FLATSCR: v_mov_b32_e32 v0, s32 ; GCN-NEXT: v_mul_u32_u24_e32 v0, 9, v0 ; GCN-NOT: v_mov @@ -86,7 +98,8 @@ ; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr: ; GCN: v_mov_b32_e32 v1, 15{{$}} -; GCN: buffer_store_dword v1, v0, s[0:3], 0 offen{{$}} +; MUBUF: buffer_store_dword v1, v0, s[0:3], 0 offen{{$}} +; GFX9-FLATSCR: scratch_store_dword v0, v1, off{{$}} define void @func_store_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 { store volatile i32 15, i32 addrspace(5)* %ptr ret void @@ -94,7 +107,8 @@ ; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr: ; GCN: s_waitcnt -; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen{{$}} +; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen{{$}} +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off{{$}} define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 { %val = load volatile i32, i32 addrspace(5)* %ptr ret void @@ -106,8 +120,11 @@ ; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6 ; CI-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]] -; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 -; GFX9-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]] +; GFX9-MUBUF: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 +; GFX9-MUBUF-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]] + +; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32 +; GFX9-FLATSCR-NEXT: v_or_b32_e32 v0, 4, [[SP]] ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -121,8 +138,10 @@ ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_value: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_ubyte v0, off, s[0:3], s32 -; GCN_NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; MUBUF-NEXT: buffer_load_ubyte v0, off, s[0:3], s32 +; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; GFX9-FLATSCR-NEXT: scratch_load_ubyte v0, off, s32 +; GFX9-FLATSCR-NEXT: scratch_load_dword v1, off, s32 offset:4 define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* byval %arg0) #0 { %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 0 %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 1 @@ -137,15 +156,17 @@ ; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6 -; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 +; GFX9-MUBUF: v_lshrrev_b32_e64 [[SP:v[0-9]+]], 6, s32 +; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32 ; GCN: s_and_saveexec_b64 ; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]] ; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}} -; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SHIFT]] -; GFX9: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}} +; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]] +; GFX9-MUBUF: 
buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}} +; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, [[SP]], off offset:4{{$}} ; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]] define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 } addrspace(5)* byval %arg0, i32 %arg2) #0 { @@ -170,8 +191,11 @@ ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI: v_add_i32_e32 [[VZ:v[0-9]+]], vcc, [[K]], [[SCALED]] -; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 -; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] +; GFX9-MUBUF-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 +; GFX9-MUBUF: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] + +; GFX9-FLATSCR-DAG: s_add_u32 [[SZ:[^,]+]], s32, 0x200 +; GFX9-FLATSCR: v_mov_b32_e32 [[VZ:v[0-9]+]], [[SZ]] ; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]] ; GCN: ds_write_b32 v0, [[VZ]] @@ -193,8 +217,11 @@ ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]] -; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 -; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] +; GFX9-MUBUF-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 +; GFX9-MUBUF: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] + +; GFX9-FLATSCR-DAG: s_add_u32 [[SZ:[^,]+]], s32, 0x200 +; GFX9-FLATSCR: v_mov_b32_e32 [[VZ:v[0-9]+]], [[SZ]] ; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]] ; GCN: ds_write_b32 v0, [[VZ]] @@ -219,10 +246,14 @@ ; GCN-LABEL: {{^}}undefined_stack_store_reg: ; GCN: s_and_saveexec_b64 -; GCN: buffer_store_dword v0, off, s[0:3], s33 offset: -; GCN: buffer_store_dword v0, off, s[0:3], s33 offset: -; GCN: buffer_store_dword v0, off, s[0:3], s33 offset: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: +; MUBUF: buffer_store_dword v0, off, s[0:3], s33 offset: +; MUBUF: buffer_store_dword v0, off, s[0:3], s33 offset: +; MUBUF: buffer_store_dword v0, off, s[0:3], s33 offset: +; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: +; FLATSCR: scratch_store_dword v0, off, s33 offset: +; FLATSCR: scratch_store_dword v0, off, s33 offset: +; FLATSCR: scratch_store_dword v0, off, s33 offset: +; FLATSCR: scratch_store_dword v{{[0-9]+}}, off, s33 offset: define void @undefined_stack_store_reg(float %arg, i32 %arg1) #0 { bb: %tmp = alloca <4 x float>, align 16, addrspace(5) @@ -243,13 +274,17 @@ ; GCN-LABEL: {{^}}alloca_ptr_nonentry_block: ; GCN: s_and_saveexec_b64 -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 +; MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 +; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4 ; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6 ; CI-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]] -; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 -; GFX9-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]] +; GFX9-MUBUF: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 +; GFX9-MUBUF-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]] + +; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32 +; GFX9-FLATSCR-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SP]] ; GCN: ds_write_b32 v{{[0-9]+}}, [[PTR]] define void @alloca_ptr_nonentry_block(i32 %arg0) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -1,6 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s +; RUN: llc -march=amdgcn 
-mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-MUBUF %s ; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-FLATSCR %s ; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lo: ; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -493,7 +494,8 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-MUBUF: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, s32 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -512,7 +514,8 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-MUBUF: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, s32 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -531,7 +534,9 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}} +; GFX900-FLATSCR: s_movk_i32 [[SOFF:[^,]+]], 0xffe +; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, [[SOFF]]{{$}} ; GFX900: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -549,7 +554,9 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, [[SOFF]]{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt @@ -567,7 +574,8 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-MUBUF: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-FLATSCR: scratch_load_ubyte_d16_hi v0, off, s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -587,7 +595,8 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-MUBUF: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-FLATSCR: scratch_load_ubyte_d16_hi v0, off, s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -608,7 +617,8 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_sexti8: ; GCN: s_waitcnt -; GFX900: 
buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-MUBUF: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-FLATSCR: scratch_load_sbyte_d16_hi v0, off, s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -629,7 +639,8 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-MUBUF: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-FLATSCR: scratch_load_sbyte_d16_hi v0, off, s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -649,7 +660,9 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]]{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt @@ -668,7 +681,9 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v1, off, [[SOFF]]{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt @@ -687,7 +702,9 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]]{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt @@ -787,8 +804,10 @@ ; to offset variant. 
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset: -; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094 +; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v{{[0-9]+}}, off, s32 offset:4094 define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -804,8 +823,10 @@ } ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset: -; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 +; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095 define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -822,8 +843,10 @@ } ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset: -; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 +; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095 define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -974,9 +997,11 @@ ; FIXME: Is there a cost to using the extload over not? 
; GCN-LABEL: {{^}}load_private_v2i16_split: ; GCN: s_waitcnt -; GFX900: buffer_load_ushort v0, off, s[0:3], s32{{$}} +; GFX900-MUBUF: buffer_load_ushort v0, off, s[0:3], s32{{$}} +; GFX900-FLATSCR: scratch_load_ushort v0, off, s32{{$}} ; GFX900-NEXT: s_waitcnt -; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2 +; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2 +; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s32 offset:2 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-MUBUF %s ; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs --amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-FLATSCR %s define <2 x i16> @load_local_lo_v2i16_undeflo(i16 addrspace(3)* %in) #0 { ; GFX900-LABEL: load_local_lo_v2i16_undeflo: @@ -1177,14 +1178,14 @@ } define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg: ; GFX906: ; %bb.0: ; %entry @@ -1207,6 +1208,15 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v0, off, s32 offset:4094 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047 @@ -1217,16 +1227,16 @@ } define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* 
byval %in, i16 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reghi_vreg: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reghi_vreg: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX900-MUBUF-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg: ; GFX906: ; %bb.0: ; %entry @@ -1249,6 +1259,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reghi_vreg: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_ushort v1, off, s32 offset:4094 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX900-FLATSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047 %load = load i16, i16 addrspace(5)* %gep @@ -1259,14 +1280,14 @@ } define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg: ; GFX906: ; %bb.0: ; %entry @@ -1290,6 +1311,15 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v0, off, s32 offset:4094 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047 @@ -1300,14 +1330,14 @@ } define void 
@load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: ; GFX906: ; %bb.0: ; %entry @@ -1330,6 +1360,16 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*) @@ -1339,14 +1379,14 @@ } define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: ; GFX906: ; %bb.0: ; %entry @@ -1369,6 +1409,16 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*) @@ -1378,14 +1428,14 @@ } define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 %reg) #0 { -; GFX900-LABEL: 
load_private_lo_v2f16_reglo_vreg_nooff: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: ; GFX906: ; %bb.0: ; %entry @@ -1409,6 +1459,16 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*) @@ -1418,14 +1478,14 @@ } define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8: ; GFX906: ; %bb.0: ; %entry @@ -1449,6 +1509,15 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095 @@ -1460,14 +1529,14 @@ } define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: 
buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8: ; GFX906: ; %bb.0: ; %entry @@ -1490,6 +1559,15 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095 @@ -1501,14 +1579,14 @@ } define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: ; GFX906: ; %bb.0: ; %entry @@ -1532,6 +1610,16 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) @@ -1542,14 +1630,14 @@ } define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: 
global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: ; GFX906: ; %bb.0: ; %entry @@ -1572,6 +1660,16 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) @@ -1582,14 +1680,14 @@ } define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: ; GFX906: ; %bb.0: ; %entry @@ -1614,6 +1712,16 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) @@ -1801,16 +1909,16 @@ } define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 -; GFX900-NEXT: s_waitcnt 
vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset: ; GFX906: ; %bb.0: ; %entry @@ -1837,6 +1945,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v0, off, s32 offset:4094 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i16], align 2, addrspace(5) @@ -1851,16 +1970,16 @@ } define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: ; GFX906: ; %bb.0: ; %entry @@ -1887,6 +2006,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i8], align 2, addrspace(5) @@ -1902,16 
+2032,16 @@ } define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: ; GFX906: ; %bb.0: ; %entry @@ -1939,6 +2069,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i8], align 2, addrspace(5) @@ -1954,16 +2095,16 @@ } define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset: ; GFX906: ; %bb.0: ; %entry @@ -1991,6 +2132,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: 
v_mov_b32_e32 v1, 0x7b +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i8], align 2, addrspace(5) @@ -2007,16 +2159,16 @@ } define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset: ; GFX906: ; %bb.0: ; %entry @@ -2045,6 +2197,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i8], align 2, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck -check-prefixes=GCN,GFX9,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,GFX9,FLATSCR %s ; Make sure the correct frame offset is used with the local ; frame area. @@ -16,42 +17,78 @@ ; correct FP offset.
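The FLATSCR checks below show what SelectScratchSAddr does when a constant offset does not fit the signed immediate field: 0x20d0 is split into a remainder of 0x2000, materialized with s_movk_i32/s_add_u32, plus a 208-byte immediate carried on the scratch_load itself. A minimal standalone sketch of that split follows, assuming 13 signed offset bits (a +/-4 KiB range, consistent with the offset:4095 immediates elsewhere in these tests); the in-tree code queries the width via TII->getNumFlatOffsetBits(), and splitScratchOffset is an illustrative name, not the in-tree helper:

```cpp
// Sketch of the flat-scratch offset split, under the stated assumptions.
#include <cassert>
#include <cstdint>

constexpr unsigned NumBits = 13; // assumed signed offset width on gfx9

// Split a constant offset into a remainder (to be materialized into an
// SGPR) and a field that fits the instruction's signed immediate. Signed
// division by a power of two truncates towards zero, so ImmField keeps
// the sign of the original offset and stays within NumBits.
void splitScratchOffset(int64_t COffsetVal, int64_t &ImmField,
                        int64_t &RemainderOffset) {
  const int64_t D = int64_t(1) << (NumBits - 1); // 0x1000
  RemainderOffset = (COffsetVal / D) * D;
  ImmField = COffsetVal - RemainderOffset;
}

int main() {
  int64_t Imm = 0, Rem = 0;
  splitScratchOffset(0x20d0, Imm, Rem);
  // Matches the FLATSCR checks below: s_movk_i32 s6, 0x2000 followed by
  // scratch_load_dword ... offset:208.
  assert(Rem == 0x2000 && Imm == 208);
  return 0;
}
```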
define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspace(1)* %in) { -; GCN-LABEL: local_stack_offset_uses_sp: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: v_mov_b32_e32 v1, 0x3000 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: v_add_u32_e32 v0, 64, v1 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v3, 0x2000 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: BB0_1: ; %loadstoreloop -; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_add_u32_e32 v3, s6, v1 -; GCN-NEXT: s_add_i32 s6, s6, 1 -; GCN-NEXT: s_cmpk_lt_u32 s6, 0x2120 -; GCN-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen -; GCN-NEXT: s_cbranch_scc1 BB0_1 -; GCN-NEXT: ; %bb.2: ; %split -; GCN-NEXT: v_mov_b32_e32 v1, 0x3000 -; GCN-NEXT: v_add_u32_e32 v1, 0x20d0, v1 -; GCN-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: global_store_dwordx2 v[2:3], v[0:1], off -; GCN-NEXT: s_endpgm +; MUBUF-LABEL: local_stack_offset_uses_sp: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; MUBUF-NEXT: s_add_u32 s0, s0, s9 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000 +; MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; MUBUF-NEXT: v_add_u32_e32 v0, 64, v1 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0 +; MUBUF-NEXT: v_mov_b32_e32 v3, 0x2000 +; MUBUF-NEXT: s_mov_b32 s6, 0 +; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; MUBUF-NEXT: BB0_1: ; %loadstoreloop +; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 +; MUBUF-NEXT: v_add_u32_e32 v3, s6, v1 +; MUBUF-NEXT: s_add_i32 s6, s6, 1 +; MUBUF-NEXT: s_cmpk_lt_u32 s6, 0x2120 +; MUBUF-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen +; MUBUF-NEXT: s_cbranch_scc1 BB0_1 +; MUBUF-NEXT: ; %bb.2: ; %split +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000 +; MUBUF-NEXT: v_add_u32_e32 v1, 0x20d0, v1 +; MUBUF-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(1) +; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: v_mov_b32_e32 v2, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; MUBUF-NEXT: v_mov_b32_e32 v3, s5 +; MUBUF-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; MUBUF-NEXT: s_endpgm +; +; FLATSCR-LABEL: local_stack_offset_uses_sp: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: s_movk_i32 vcc_hi, 0x2000 +; FLATSCR-NEXT: s_mov_b32 s6, 0 +; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi +; 
FLATSCR-NEXT: BB0_1: ; %loadstoreloop +; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 +; FLATSCR-NEXT: s_add_u32 s7, 0x3000, s6 +; FLATSCR-NEXT: s_add_i32 s6, s6, 1 +; FLATSCR-NEXT: s_cmpk_lt_u32 s6, 0x2120 +; FLATSCR-NEXT: scratch_store_byte off, v0, s7 +; FLATSCR-NEXT: s_cbranch_scc1 BB0_1 +; FLATSCR-NEXT: ; %bb.2: ; %split +; FLATSCR-NEXT: s_movk_i32 s6, 0x20d0 +; FLATSCR-NEXT: s_add_u32 s6, 0x3000, s6 +; FLATSCR-NEXT: scratch_load_dword v1, off, s6 offset:4 +; FLATSCR-NEXT: s_movk_i32 s6, 0x2000 +; FLATSCR-NEXT: s_add_u32 s6, 0x3000, s6 +; FLATSCR-NEXT: scratch_load_dword v0, off, s6 offset:208 +; FLATSCR-NEXT: s_movk_i32 s6, 0x3000 +; FLATSCR-NEXT: scratch_load_dword v2, off, s6 offset:68 +; FLATSCR-NEXT: s_movk_i32 s6, 0x3000 +; FLATSCR-NEXT: scratch_load_dword v3, off, s6 offset:64 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v2, s4 +; FLATSCR-NEXT: v_mov_b32_e32 v3, s5 +; FLATSCR-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; FLATSCR-NEXT: s_endpgm entry: %pin.low = alloca i32, align 8192, addrspace(5) %local.area = alloca [1060 x i64], align 4096, addrspace(5) @@ -68,43 +105,83 @@ } define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspace(1)* %in) { -; GCN-LABEL: func_local_stack_offset_uses_sp: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_add_u32 s4, s32, 0x7ffc0 -; GCN-NEXT: s_mov_b32 s5, s33 -; GCN-NEXT: s_and_b32 s33, s4, 0xfff80000 -; GCN-NEXT: v_lshrrev_b32_e64 v3, 6, s33 -; GCN-NEXT: v_add_u32_e32 v3, 0x1000, v3 -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_add_u32_e32 v2, 64, v3 -; GCN-NEXT: s_mov_b32 s4, 0 -; GCN-NEXT: s_add_u32 s32, s32, 0x180000 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 -; GCN-NEXT: BB1_1: ; %loadstoreloop -; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_add_u32_e32 v5, s4, v3 -; GCN-NEXT: s_add_i32 s4, s4, 1 -; GCN-NEXT: s_cmpk_lt_u32 s4, 0x2120 -; GCN-NEXT: buffer_store_byte v4, v5, s[0:3], 0 offen -; GCN-NEXT: s_cbranch_scc1 BB1_1 -; GCN-NEXT: ; %bb.2: ; %split -; GCN-NEXT: v_lshrrev_b32_e64 v3, 6, s33 -; GCN-NEXT: v_add_u32_e32 v3, 0x1000, v3 -; GCN-NEXT: v_add_u32_e32 v3, 0x20d0, v3 -; GCN-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:4 -; GCN-NEXT: s_sub_u32 s32, s32, 0x180000 -; GCN-NEXT: s_mov_b32 s33, s5 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v4, v5 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc -; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; MUBUF-LABEL: func_local_stack_offset_uses_sp: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_add_u32 s4, s32, 0x7ffc0 +; MUBUF-NEXT: s_mov_b32 s5, s33 +; MUBUF-NEXT: s_and_b32 s33, s4, 0xfff80000 +; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33 +; MUBUF-NEXT: v_add_u32_e32 v3, 0x1000, v3 +; MUBUF-NEXT: v_mov_b32_e32 v4, 0 +; MUBUF-NEXT: v_add_u32_e32 v2, 64, v3 +; MUBUF-NEXT: s_mov_b32 s4, 0 +; MUBUF-NEXT: s_add_u32 s32, s32, 0x180000 +; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s33 +; MUBUF-NEXT: BB1_1: ; %loadstoreloop +; MUBUF-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; MUBUF-NEXT: v_add_u32_e32 v5, s4, v3 +; MUBUF-NEXT: s_add_i32 s4, s4, 1 +; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120 +; MUBUF-NEXT: buffer_store_byte v4, v5, s[0:3], 0 offen +; MUBUF-NEXT: s_cbranch_scc1 BB1_1 +; MUBUF-NEXT: ; %bb.2: ; %split +; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33 +; MUBUF-NEXT: v_add_u32_e32 v3, 0x1000, v3 +; MUBUF-NEXT: v_add_u32_e32 v3, 0x20d0, v3 +; MUBUF-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen +; MUBUF-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen +; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x180000 +; MUBUF-NEXT: s_mov_b32 s33, s5 +; MUBUF-NEXT: s_waitcnt vmcnt(1) +; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v4, v5 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; MUBUF-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: func_local_stack_offset_uses_sp: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_add_u32 s4, s32, 0x1fff +; FLATSCR-NEXT: s_mov_b32 s6, s33 +; FLATSCR-NEXT: s_and_b32 s33, s4, 0xffffe000 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 +; FLATSCR-NEXT: s_mov_b32 s4, 0 +; FLATSCR-NEXT: s_add_u32 s32, s32, 0x6000 +; FLATSCR-NEXT: scratch_store_dword off, v2, s33 +; FLATSCR-NEXT: BB1_1: ; %loadstoreloop +; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 +; FLATSCR-NEXT: s_add_u32 vcc_hi, s33, 0x1000 +; FLATSCR-NEXT: s_add_u32 s5, vcc_hi, s4 +; FLATSCR-NEXT: s_add_i32 s4, s4, 1 +; FLATSCR-NEXT: s_cmpk_lt_u32 s4, 0x2120 +; FLATSCR-NEXT: scratch_store_byte off, v2, s5 +; FLATSCR-NEXT: s_cbranch_scc1 BB1_1 +; FLATSCR-NEXT: ; %bb.2: ; %split +; FLATSCR-NEXT: s_movk_i32 s4, 0x20d0 +; FLATSCR-NEXT: s_add_u32 s5, s33, 0x1000 +; FLATSCR-NEXT: s_add_u32 s4, s5, s4 +; FLATSCR-NEXT: scratch_load_dword v3, off, s4 offset:4 +; FLATSCR-NEXT: s_movk_i32 s4, 0x2000 +; FLATSCR-NEXT: s_add_u32 s5, s33, 0x1000 +; FLATSCR-NEXT: s_add_u32 s4, s5, s4 +; FLATSCR-NEXT: scratch_load_dword v2, off, s4 offset:208 +; FLATSCR-NEXT: s_add_u32 s4, s33, 0x1000 +; FLATSCR-NEXT: scratch_load_dword v4, off, s4 offset:68 +; FLATSCR-NEXT: s_add_u32 s4, s33, 0x1000 +; FLATSCR-NEXT: scratch_load_dword v5, off, s4 offset:64 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x6000 +; FLATSCR-NEXT: s_mov_b32 s33, s6 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc +; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %pin.low = alloca i32, align 8192, addrspace(5) %local.area = alloca [1060 x i64], align 4096, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll --- a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll @@ -1,31 +1,56 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s -check-prefix=MUBUF +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-flat-scratch < %s | FileCheck %s -check-prefix=FLATSCR ; Make sure there's no assertion from passing a 0 alignment value define void @memcpy_fixed_align(i8 
addrspace(5)* %dst, i8 addrspace(1)* %src) { -; CHECK-LABEL: memcpy_fixed_align: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dword v0, v[1:2], off offset:36 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 -; CHECK-NEXT: global_load_dword v0, v[1:2], off offset:32 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 -; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 -; CHECK-NEXT: global_load_dwordx4 v[0:3], v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] +; MUBUF-LABEL: memcpy_fixed_align: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: global_load_dword v0, v[1:2], off offset:36 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 +; MUBUF-NEXT: global_load_dword v0, v[1:2], off offset:32 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 +; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 +; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 +; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 +; MUBUF-NEXT: global_load_dwordx4 v[0:3], v[1:2], off +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 +; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: memcpy_fixed_align: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: global_load_dword v0, v[1:2], off offset:36 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:36 +; FLATSCR-NEXT: global_load_dword v0, v[1:2], off offset:32 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:32 +; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dword off, v6, s32 offset:28 +; FLATSCR-NEXT: scratch_store_dword off, v5, s32 offset:24 +; FLATSCR-NEXT: scratch_store_dword off, v4, s32 offset:20 +; FLATSCR-NEXT: scratch_store_dword off, v3, s32 offset:16 +; FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[1:2], off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dword off, v3, s32 offset:12 +; FLATSCR-NEXT: scratch_store_dword off, v2, s32 offset:8 +; FLATSCR-NEXT: scratch_store_dword off, v1, s32 offset:4 +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 +; FLATSCR-NEXT: s_waitcnt 
vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [40 x i8], addrspace(5) %cast = bitcast [40 x i8] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memcpy.p5i8.p1i8.i64(i8 addrspace(5)* align 4 dereferenceable(40) %cast, i8 addrspace(1)* align 4 dereferenceable(40) %src, i64 40, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll --- a/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll @@ -1,12 +1,17 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s - -; CHECK-LABEL: spill_v2i32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:16 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:20 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,MUBUF +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -amdgpu-enable-flat-scratch < %s | FileCheck %s -check-prefixes=GCN,FLATSCR + +; GCN-LABEL: spill_v2i32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:16 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:20 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:16 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:20 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload define void @spill_v2i32() { entry: @@ -24,13 +29,17 @@ ret void } -; CHECK-LABEL: spill_v2f32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:16 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:20 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload +; GCN-LABEL: spill_v2f32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:16 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:20 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:16 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:20 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload define void @spill_v2f32() { entry: @@ -48,15 +57,21 @@ ret void } -; CHECK-LABEL: spill_v3i32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: 
;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; GCN-LABEL: spill_v3i32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:32 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:36 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:40 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload define void @spill_v3i32() { entry: @@ -74,15 +89,21 @@ ret void } -; CHECK-LABEL: spill_v3f32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; GCN-LABEL: spill_v3f32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:32 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:36 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:40 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload define void @spill_v3f32() { entry: @@ -100,17 +121,25 @@ ret void } -; CHECK-LABEL: spill_v4i32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:44 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload +; GCN-LABEL: spill_v4i32: +; MUBUF-DAG: 
buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:44 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:32 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:36 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:40 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:44 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload define void @spill_v4i32() { entry: @@ -128,17 +157,25 @@ ret void } -; CHECK-LABEL: spill_v4f32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:44 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload +; GCN-LABEL: spill_v4f32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:44 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:32 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:36 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:40 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:44 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload define void @spill_v4f32() { entry: @@ -156,17 +193,25 @@ ret void } -; CHECK-LABEL: spill_v5i32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:64 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:68 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:72 ; 
4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:76 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload +; GCN-LABEL: spill_v5i32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:64 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:68 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:72 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:76 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:64 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:68 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:72 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:76 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload define void @spill_v5i32() { entry: %alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5) @@ -183,17 +228,25 @@ ret void } -; CHECK-LABEL: spill_v5f32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:64 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:68 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:72 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:76 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload +; GCN-LABEL: spill_v5f32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:64 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:68 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:72 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:76 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:64 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:68 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:72 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:76 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload 
+; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload define void @spill_v5f32() { entry: %alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE,FLATSCR %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024,FLATSCR %s ; FIXME: Generated test checks do not check metadata at the end of the ; function, so this also includes manually added checks. @@ -11,44 +13,82 @@ ; FIXME: FunctionLoweringInfo unhelpfully doesn't preserve an ; alignment less than the stack alignment. 
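One difference worth noting before the checks: MUBUF keeps the SGPR stack pointer in wave-scaled units (byte offsets multiplied by the wave size), while flat scratch addresses bytes directly, so every frame-size and realignment constant in the FLATSCR output is the MUBUF constant divided by the wave size. A small sketch relating the constants that appear in these tests, assuming the 64-lane wave of the gfx900 target named in the RUN lines:

```cpp
// Relates the MUBUF (wave-scaled) and FLATSCR (byte) stack constants
// seen in the checks below. Assumes a 64-lane wave.
#include <cassert>
#include <cstdint>

int main() {
  constexpr uint32_t WaveSize = 64;

  // align4 tests: FLATSCR sets up a 16-byte frame (s32 = 16 for the
  // kernel, s32 += 16 for the function); MUBUF uses the scaled 0x400.
  assert(16u * WaveSize == 0x400u);

  // align64 function: FLATSCR bumps s32 by 0x80 bytes, MUBUF by 0x2000.
  assert(0x80u * WaveSize == 0x2000u);

  // 64-byte realignment: FLATSCR ands the FP with ~63 (0xffffffc0),
  // MUBUF with the wave-scaled equivalent (0xfffff000).
  assert(~(WaveSize - 1) == 0xffffffc0u);
  assert(~(64u * WaveSize - 1) == 0xfffff000u);
  return 0;
}
```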
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { -; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_movk_i32 s32, 0x400 -; GCN-NEXT: s_mov_b32 s33, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cbranch_scc1 BB0_3 -; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_cmp_lg_u32 s9, 0 -; GCN-NEXT: s_cbranch_scc1 BB0_3 -; GCN-NEXT: ; %bb.2: ; %bb.1 -; GCN-NEXT: s_add_i32 s6, s32, 0x1000 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: s_lshl_b32 s7, s10, 2 -; GCN-NEXT: s_mov_b32 s32, s6 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: s_add_i32 s6, s6, s7 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_u32_e32 v2, v1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: BB0_3: ; %bb.2 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_endpgm +; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; MUBUF-NEXT: s_add_u32 s0, s0, s9 +; MUBUF-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; MUBUF-NEXT: s_movk_i32 s32, 0x400 +; MUBUF-NEXT: s_mov_b32 s33, 0 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: s_cmp_lg_u32 s8, 0 +; MUBUF-NEXT: s_cbranch_scc1 BB0_3 +; MUBUF-NEXT: ; %bb.1: ; %bb.0 +; MUBUF-NEXT: s_cmp_lg_u32 s9, 0 +; MUBUF-NEXT: s_cbranch_scc1 BB0_3 +; MUBUF-NEXT: ; %bb.2: ; %bb.1 +; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0 +; MUBUF-NEXT: v_mov_b32_e32 v2, s6 +; MUBUF-NEXT: s_lshl_b32 s7, s10, 2 +; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; MUBUF-NEXT: v_mov_b32_e32 v1, 1 +; MUBUF-NEXT: s_add_i32 s6, s6, s7 +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: v_mov_b32_e32 v1, s6 +; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v2, v1, v0 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: v_mov_b32_e32 v0, s4 +; MUBUF-NEXT: v_mov_b32_e32 v1, s5 +; MUBUF-NEXT: global_store_dword v[0:1], v2, off +; MUBUF-NEXT: BB0_3: ; %bb.2 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_endpgm +; +; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_mov_b32 s32, 16 +; FLATSCR-NEXT: s_mov_b32 s33, 0 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: s_cmp_lg_u32 s8, 0 +; 
FLATSCR-NEXT: s_cbranch_scc1 BB0_3 +; FLATSCR-NEXT: ; %bb.1: ; %bb.0 +; FLATSCR-NEXT: s_cmp_lg_u32 s9, 0 +; FLATSCR-NEXT: s_cbranch_scc1 BB0_3 +; FLATSCR-NEXT: ; %bb.2: ; %bb.1 +; FLATSCR-NEXT: s_mov_b32 s6, s32 +; FLATSCR-NEXT: s_movk_i32 s7, 0x1000 +; FLATSCR-NEXT: s_add_i32 s8, s6, s7 +; FLATSCR-NEXT: s_add_u32 s6, s6, s7 +; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 +; FLATSCR-NEXT: scratch_store_dword off, v1, s6 +; FLATSCR-NEXT: v_mov_b32_e32 v1, 1 +; FLATSCR-NEXT: s_lshl_b32 s6, s10, 2 +; FLATSCR-NEXT: s_mov_b32 s32, s8 +; FLATSCR-NEXT: scratch_store_dword off, v1, s8 offset:4 +; FLATSCR-NEXT: s_add_i32 s8, s8, s6 +; FLATSCR-NEXT: scratch_load_dword v1, off, s8 +; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, s4 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s5 +; FLATSCR-NEXT: global_store_dword v[0:1], v2, off +; FLATSCR-NEXT: BB0_3: ; %bb.2 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; FLATSCR-NEXT: s_endpgm entry: %cond0 = icmp eq i32 %arg.cond0, 0 @@ -83,42 +123,75 @@ ; ASSUME1024: ; ScratchSize: 1040 define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) { -; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_movk_i32 s32, 0x1000 -; GCN-NEXT: s_mov_b32 s33, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cbranch_scc1 BB1_2 -; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_add_i32 s6, s32, 0x1000 -; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: s_lshl_b32 s7, s7, 2 -; GCN-NEXT: s_mov_b32 s32, s6 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: s_add_i32 s6, s6, s7 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_u32_e32 v2, v1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: BB1_2: ; %bb.1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_endpgm +; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; MUBUF-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; MUBUF-NEXT: s_add_u32 s0, s0, s9 +; MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; MUBUF-NEXT: s_movk_i32 s32, 0x1000 +; MUBUF-NEXT: s_mov_b32 s33, 0 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: s_cmp_lg_u32 s6, 0 +; MUBUF-NEXT: s_cbranch_scc1 BB1_2 +; MUBUF-NEXT: ; %bb.1: ; %bb.0 +; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 +; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0 +; MUBUF-NEXT: v_mov_b32_e32 v2, s6 +; MUBUF-NEXT: s_lshl_b32 s7, s7, 2 +; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen +; MUBUF-NEXT: v_mov_b32_e32 v1, 1 +; MUBUF-NEXT: s_add_i32 s6, s6, s7 +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: v_mov_b32_e32 v1, s6 +; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v2, v1, v0 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: v_mov_b32_e32 v0, s4 +; MUBUF-NEXT: v_mov_b32_e32 v1, s5 +; MUBUF-NEXT: global_store_dword v[0:1], v2, off +; MUBUF-NEXT: BB1_2: ; %bb.1 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_endpgm +; +; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; FLATSCR-NEXT: s_mov_b32 s32, 64 +; FLATSCR-NEXT: s_mov_b32 s33, 0 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: s_cmp_lg_u32 s6, 0 +; FLATSCR-NEXT: s_cbranch_scc1 BB1_2 +; FLATSCR-NEXT: ; %bb.1: ; %bb.0 +; FLATSCR-NEXT: s_add_i32 s6, s32, 0x1000 +; FLATSCR-NEXT: s_and_b32 s6, s6, 0xfffff000 +; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 +; FLATSCR-NEXT: scratch_store_dword off, v1, s6 +; FLATSCR-NEXT: v_mov_b32_e32 v1, 1 +; FLATSCR-NEXT: s_lshl_b32 s7, s7, 2 +; FLATSCR-NEXT: s_mov_b32 s32, s6 +; FLATSCR-NEXT: scratch_store_dword off, v1, s6 offset:4 +; FLATSCR-NEXT: s_add_i32 s6, s6, s7 +; FLATSCR-NEXT: scratch_load_dword v1, off, s6 +; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, s4 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s5 +; FLATSCR-NEXT: global_store_dword v[0:1], v2, off +; FLATSCR-NEXT: BB1_2: ; %bb.1 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; FLATSCR-NEXT: s_endpgm entry: %cond = icmp eq i32 %arg.cond, 0 br i1 %cond, label %bb.0, label %bb.1 @@ -149,41 +222,79 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { -; GCN-LABEL: func_non_entry_block_static_alloca_align4: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s7, s33 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz BB2_3 -; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN-NEXT: s_and_b64 exec, exec, vcc -; GCN-NEXT: s_cbranch_execz BB2_3 -; GCN-NEXT: ; %bb.2: ; %bb.1 -; GCN-NEXT: s_add_i32 s6, s32, 0x1000 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v2, 1 -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_lshl_add_u32 v2, v4, 2, s6 -; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v5 -; GCN-NEXT: s_mov_b32 s32, s6 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_u32_e32 v2, v2, v3 -; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: BB2_3: ; %bb.2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: s_mov_b32 s33, s7 -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; MUBUF-LABEL: func_non_entry_block_static_alloca_align4: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s7, s33 +; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_add_u32 s32, s32, 0x400 +; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc +; MUBUF-NEXT: s_cbranch_execz BB2_3 +; MUBUF-NEXT: ; %bb.1: ; %bb.0 +; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; MUBUF-NEXT: s_and_b64 exec, exec, vcc +; MUBUF-NEXT: s_cbranch_execz BB2_3 +; MUBUF-NEXT: ; %bb.2: ; %bb.1 +; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0 +; MUBUF-NEXT: v_mov_b32_e32 v3, s6 +; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; MUBUF-NEXT: v_mov_b32_e32 v2, 1 +; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6 +; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v5 +; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 +; MUBUF-NEXT: global_store_dword v[0:1], v2, off +; MUBUF-NEXT: BB2_3: ; %bb.2 +; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5] +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x400 +; MUBUF-NEXT: s_mov_b32 s33, s7 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s9, s33 +; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_add_u32 s32, s32, 16 +; FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; FLATSCR-NEXT: s_cbranch_execz BB2_3 +; FLATSCR-NEXT: ; %bb.1: ; %bb.0 +; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; FLATSCR-NEXT: s_and_b64 exec, exec, vcc +; FLATSCR-NEXT: s_cbranch_execz BB2_3 +; FLATSCR-NEXT: ; %bb.2: ; %bb.1 +; FLATSCR-NEXT: s_mov_b32 s6, s32 +; FLATSCR-NEXT: s_movk_i32 s7, 0x1000 +; FLATSCR-NEXT: s_add_i32 s8, s6, s7 +; FLATSCR-NEXT: s_add_u32 s6, s6, s7 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 +; FLATSCR-NEXT: scratch_store_dword off, v2, s6 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 +; FLATSCR-NEXT: scratch_store_dword off, v2, s8 offset:4 +; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s8 +; FLATSCR-NEXT: scratch_load_dword v2, v2, off +; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v5 +; FLATSCR-NEXT: s_mov_b32 s32, s8 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 +; FLATSCR-NEXT: global_store_dword v[0:1], v2, off +; FLATSCR-NEXT: BB2_3: ; %bb.2 +; FLATSCR-NEXT: s_or_b64 exec, exec, s[4:5] +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; FLATSCR-NEXT: s_sub_u32 s32, s32, 16 +; FLATSCR-NEXT: s_mov_b32 s33, s9 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %cond0 = icmp eq i32 %arg.cond0, 0 @@ -213,39 +324,72 @@ } define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) { -; GCN-LABEL: func_non_entry_block_static_alloca_align64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_add_u32 s4, s32, 0xfc0 -; GCN-NEXT: s_mov_b32 s7, s33 -; GCN-NEXT: s_and_b32 s33, s4, 0xfffff000 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_add_u32 s32, s32, 0x2000 -; GCN-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz BB3_2 -; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_add_i32 s6, s32, 0x1000 -; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v5, s6 -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v2, 1 -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_lshl_add_u32 v2, v3, 2, s6 -; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v4 -; GCN-NEXT: s_mov_b32 s32, s6 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_u32_e32 v2, v2, v3 -; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: BB3_2: ; %bb.1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_sub_u32 s32, s32, 0x2000 -; GCN-NEXT: s_mov_b32 s33, s7 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; MUBUF-LABEL: func_non_entry_block_static_alloca_align64: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_add_u32 s4, s32, 0xfc0 +; MUBUF-NEXT: s_mov_b32 s7, s33 +; MUBUF-NEXT: s_and_b32 s33, s4, 0xfffff000 +; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; MUBUF-NEXT: s_add_u32 s32, s32, 0x2000 +; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc +; MUBUF-NEXT: s_cbranch_execz BB3_2 +; MUBUF-NEXT: ; %bb.1: ; %bb.0 +; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 +; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0 +; MUBUF-NEXT: v_mov_b32_e32 v5, s6 +; MUBUF-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; MUBUF-NEXT: v_mov_b32_e32 v2, 1 +; MUBUF-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6 +; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v4 +; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 +; MUBUF-NEXT: global_store_dword v[0:1], v2, off +; MUBUF-NEXT: BB3_2: ; %bb.1 +; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5] +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x2000 +; MUBUF-NEXT: s_mov_b32 s33, s7 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_add_u32 s4, s32, 63 +; FLATSCR-NEXT: s_mov_b32 s7, s33 +; FLATSCR-NEXT: s_and_b32 s33, s4, 0xffffffc0 +; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; FLATSCR-NEXT: s_add_u32 s32, s32, 0x80 +; FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; FLATSCR-NEXT: s_cbranch_execz BB3_2 +; FLATSCR-NEXT: ; %bb.1: ; %bb.0 +; FLATSCR-NEXT: s_add_i32 s6, s32, 0x1000 +; FLATSCR-NEXT: s_and_b32 s6, s6, 0xfffff000 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 +; FLATSCR-NEXT: scratch_store_dword off, v2, s6 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 +; FLATSCR-NEXT: scratch_store_dword off, v2, s6 offset:4 +; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s6 +; FLATSCR-NEXT: scratch_load_dword v2, v2, off +; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v4 +; FLATSCR-NEXT: s_mov_b32 s32, s6 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 +; FLATSCR-NEXT: global_store_dword v[0:1], v2, off +; FLATSCR-NEXT: BB3_2: ; %bb.1 +; FLATSCR-NEXT: s_or_b64 exec, exec, s[4:5] +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: 
global_store_dword v[0:1], v0, off +; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x80 +; FLATSCR-NEXT: s_mov_b32 s33, s7 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %cond = icmp eq i32 %arg.cond, 0 br i1 %cond, label %bb.0, label %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=MUBUF %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog -amdgpu-enable-flat-scratch %s -o - | FileCheck --check-prefix=FLATSCR %s # Test what happens when an SGPR is unavailable for the unused add. The non-inline constant needs to be folded into the add instruction and not materialized in a register. @@ -21,19 +22,32 @@ bb.0: liveins: $vgpr1 - ; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs - ; CHECK: liveins: $sgpr27, $vgpr1 - ; CHECK: $sgpr27 = frame-setup COPY $sgpr33 - ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc - ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc - ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc - ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; CHECK: $vgpr2 = V_ADD_U32_e32 8192, killed $vgpr2, implicit $exec - ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 - ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc - ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 - ; CHECK: S_ENDPGM 0, implicit $vcc + ; MUBUF-LABEL: name: scavenge_sgpr_pei_no_sgprs + ; MUBUF: liveins: $sgpr27, $vgpr1 + ; MUBUF: $sgpr27 = frame-setup COPY $sgpr33 + ; MUBUF: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc + ; MUBUF: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; MUBUF: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc + ; MUBUF: S_NOP 0, implicit-def $sgpr4, 
implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc + ; MUBUF: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; MUBUF: $vgpr2 = V_ADD_U32_e32 8192, killed $vgpr2, implicit $exec + ; MUBUF: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; MUBUF: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc + ; MUBUF: $sgpr33 = frame-setup COPY $sgpr27 + ; MUBUF: S_ENDPGM 0, implicit $vcc + ; FLATSCR-LABEL: name: scavenge_sgpr_pei_no_sgprs + ; FLATSCR: liveins: $sgpr27, $vgpr1 + ; FLATSCR: $sgpr27 = frame-setup COPY $sgpr33 + ; FLATSCR: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 8191, implicit-def $scc + ; FLATSCR: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294959104, implicit-def $scc + ; FLATSCR: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 24576, implicit-def $scc + ; FLATSCR: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc + ; FLATSCR: $sgpr33 = S_ADD_U32 $sgpr33, 8192, implicit-def $scc + ; FLATSCR: $vgpr0 = V_OR_B32_e32 $sgpr33, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; FLATSCR: $sgpr33 = S_SUB_U32 $sgpr33, 8192, implicit-def $scc + ; FLATSCR: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 24576, implicit-def $scc + ; FLATSCR: $sgpr33 = frame-setup COPY $sgpr27 + ; FLATSCR: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, 
implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 S_ENDPGM 0, implicit $vcc diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GFX8 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog -amdgpu-enable-flat-scratch %s -o - | FileCheck -check-prefix=GFX9-FLATSCR %s # Test case where spilling a VGPR to an emergency slot is needed during frame index elimination. 
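# A reading of the checks below (commentary, not additional test input): with
# MUBUF, the stack offsets held in $sgpr32/$sgpr33 are wave-scaled, so
# resolving a frame index needs per-lane VGPR math, and with every VGPR live
# the scavenger has to spill one to an emergency slot and reload it (the GFX9
# checks reload $vgpr3 from %stack.3 with BUFFER_LOAD_DWORD_OFFSET). With flat
# scratch the frame pointer is already a per-thread byte offset, so a single
# scalar add into a scavenged SGPR suffices and the emergency VGPR spill
# disappears:
#
#   GFX9 (MUBUF):  $vgpr3 = BUFFER_LOAD_DWORD_OFFSET ... (load 4 from %stack.3)
#   GFX9-FLATSCR:  $vcc_hi = S_ADD_U32 $sgpr33, 8192, implicit-def $scc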
@@ -55,6 +56,17 @@ ; GFX9: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc ; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5) ; GFX9: S_ENDPGM 0, csr_amdgpu_allvgprs + ; GFX9-FLATSCR-LABEL: name: pei_scavenge_vgpr_spill + ; GFX9-FLATSCR: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2 + ; GFX9-FLATSCR: $vgpr2 = V_WRITELANE_B32_vi $sgpr33, 0, undef $vgpr2 + ; GFX9-FLATSCR: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 8191, implicit-def $scc + ; GFX9-FLATSCR: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294959104, implicit-def $scc + ; GFX9-FLATSCR: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 24576, implicit-def $scc + ; GFX9-FLATSCR: $vcc_hi = S_ADD_U32 $sgpr33, 8192, implicit-def $scc + ; GFX9-FLATSCR: $vgpr0 = V_OR_B32_e32 killed $vcc_hi, $vgpr1, implicit $exec + ; GFX9-FLATSCR: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 24576, implicit-def $scc + ; GFX9-FLATSCR: $sgpr33 = V_READLANE_B32_vi $vgpr2, 0 + ; GFX9-FLATSCR: S_ENDPGM 0, csr_amdgpu_allvgprs $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec S_ENDPGM 0, csr_amdgpu_allvgprs ... 
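A quick standalone sketch of what the new -amdgpu-enable-flat-scratch RUN lines
exercise across these tests (a hypothetical example, not part of the patch):
private addrspace(5) accesses lower to buffer_* instructions against the
SCRATCH_RSRC descriptor by default, and to scratch_* flat-scratch instructions
under the new flag (assuming the alloca survives promotion; the in-tree tests
defeat that with divergent indexing or -mattr=-promote-alloca):

  ; flat-scratch-sketch.ll (hypothetical)
  define amdgpu_kernel void @store_private(i32 %v) {
    %p = alloca i32, align 4, addrspace(5)
    store volatile i32 %v, i32 addrspace(5)* %p
    ret void
  }

  ; llc -march=amdgcn -mcpu=gfx900 flat-scratch-sketch.ll
  ;   -> buffer_store_dword ..., s[0:3], ...
  ; llc -march=amdgcn -mcpu=gfx900 -amdgpu-enable-flat-scratch flat-scratch-sketch.ll
  ;   -> scratch_store_dword ...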
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -1,9 +1,11 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI,SIVI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI,SIVI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI,SIVI,MUBUF %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI,SIVI,MUBUF %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10,MUBUF,GFX9-MUBUF,GFX9_10-MUBUF %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -filetype=obj -amdgpu-use-divergent-register-indexing < %s | llvm-readobj -r - | FileCheck --check-prefix=RELS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W64,GFX9_10 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10,MUBUF,GFX10_W32-MUBUF,GFX9_10-MUBUF %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W64,GFX9_10,MUBUF,GFX10_W64-MUBUF,GFX9_10-MUBUF %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10,FLATSCR,GFX9-FLATSCR %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1030 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10,FLATSCR,GFX10-FLATSCR,GFX9_10-FLATSCR %s ; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD0 0x0 ; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD1 0x0 @@ -14,14 +16,33 @@ ; ; GCN-LABEL: {{^}}ps_main: -; GCN-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 -; GCN-DAG: s_mov_b32 s1, SCRATCH_RSRC_DWORD1 -; GCN-DAG: s_mov_b32 s2, -1 +; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s2 +; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0 + +; GFX10-FLATSCR: s_add_u32 s0, s0, s2 +; GFX10-FLATSCR: s_addc_u32 s1, s1, 0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 + +; 
MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; MUBUF-DAG: s_mov_b32 s1, SCRATCH_RSRC_DWORD1 +; MUBUF-DAG: s_mov_b32 s2, -1 ; SI-DAG: s_mov_b32 s3, 0xe8f000 ; VI-DAG: s_mov_b32 s3, 0xe80000 -; GFX9-DAG: s_mov_b32 s3, 0xe00000 -; GFX10_W32-DAG: s_mov_b32 s3, 0x31c16000 -; GFX10_W64-DAG: s_mov_b32 s3, 0x31e16000 +; GFX9-MUBUF-DAG: s_mov_b32 s3, 0xe00000 +; GFX10_W32-MUBUF-DAG: s_mov_b32 s3, 0x31c16000 +; GFX10_W64-MUBUF-DAG: s_mov_b32 s3, 0x31e16000 + +; FLATSCR-NOT: SCRATCH_RSRC_DWORD + +; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0 +; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset: +; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0 +; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset: + +; GFX10-FLATSCR: scratch_store_dword off, v2, off offset: +; GFX10-FLATSCR: scratch_store_dword off, v2, off offset: + ; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 ; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] ; GCN-NOT: s_mov_b32 s0 @@ -29,8 +50,10 @@ ; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]] ; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]] -; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; MUBUF: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; MUBUF: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[LO_OFF]], off +; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[HI_OFF]], off define amdgpu_ps float @ps_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -39,10 +62,30 @@ } ; GCN-LABEL: {{^}}vs_main: -; GCN-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s2 +; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0 + +; GFX10-FLATSCR: s_add_u32 s0, s0, s2 +; GFX10-FLATSCR: s_addc_u32 s1, s1, 0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 + +; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 ; GCN-NOT: s_mov_b32 s0 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen + +; FLATSCR-NOT: SCRATCH_RSRC_DWORD + +; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen + +; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0 +; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset: +; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0 +; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset: + +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off + define amdgpu_vs float @vs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -51,9 +94,23 @@ } ; GCN-LABEL: {{^}}cs_main: -; GCN-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s2 +; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0 + +; GFX10-FLATSCR: s_add_u32 s0, s0, s2 +; GFX10-FLATSCR: s_addc_u32 s1, s1, 0 +; GFX10-FLATSCR: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 + +; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 + +; FLATSCR-NOT: SCRATCH_RSRC_DWORD + +; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen + +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off define amdgpu_cs float @cs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -62,15 +119,27 @@ } ; GCN-LABEL: {{^}}hs_main: +; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0 + +; GFX10-FLATSCR: s_add_u32 s0, s0, s5 +; GFX10-FLATSCR: s_addc_u32 s1, s1, 0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 + ; SIVI: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 ; SIVI-NOT: s_mov_b32 s0 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 -; GFX9_10-NOT: s_mov_b32 s5 -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; GFX9_10-NOT: s_mov_b32 s5 +; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen + +; FLATSCR-NOT: SCRATCH_RSRC_DWORD +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off define amdgpu_hs float @hs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -79,13 +148,25 @@ } ; GCN-LABEL: {{^}}gs_main: +; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0 + +; GFX10-FLATSCR: s_add_u32 s0, s0, s5 +; GFX10-FLATSCR: s_addc_u32 s1, s1, 0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 + ; SIVI: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen + +; FLATSCR-NOT: SCRATCH_RSRC_DWORD +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off define amdgpu_gs float @gs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -99,17 +180,29 @@ ; (i.e. SI_RETURN_TO_EPILOG) can access the scratch wave offset. 
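; Commentary on the two init sequences the FLATSCR prefixes keep matching (my
; reading of the codegen, not extra test input): gfx9 still exposes
; FLAT_SCRATCH as the addressable flat_scratch_lo/flat_scratch_hi SGPR pair,
; so the wave offset is folded in with s_add_u32/s_addc_u32 directly on that
; pair; gfx10 dropped the SGPR alias, so the add is done in s[0:1] and the
; result is written to the hardware register via s_setreg_b32
; hwreg(HW_REG_FLAT_SCR_LO) / hwreg(HW_REG_FLAT_SCR_HI).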
; GCN-LABEL: {{^}}hs_ir_uses_scratch_offset: -; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0 + +; GFX10-FLATSCR: s_add_u32 s0, s0, s5 +; GFX10-FLATSCR: s_addc_u32 s1, s1, 0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 + +; MUBUF: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; FLATSCR-NOT: SCRATCH_RSRC_DWORD ; SIVI-NOT: s_mov_b32 s6 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; GFX9_10-NOT: s_mov_b32 s5 -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen + +; MUBUF-DAG: s_mov_b32 s2, s5 -; GCN-DAG: s_mov_b32 s2, s5 +; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -120,15 +213,27 @@ } ; GCN-LABEL: {{^}}gs_ir_uses_scratch_offset: -; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0 + +; GFX10-FLATSCR: s_add_u32 s0, s0, s5 +; GFX10-FLATSCR: s_addc_u32 s1, s1, 0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 + +; MUBUF: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; FLATSCR-NOT: SCRATCH_RSRC_DWORD ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen + +; MUBUF-DAG: s_mov_b32 s2, s5 -; GCN-DAG: s_mov_b32 s2, s5 +; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir @@ -1,21 +1,28 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=CHECK -check-prefix=GCN64 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=CHECK -check-prefix=GCN32 %s +# RUN: llc 
-mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GCN64,MUBUF %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GCN32,MUBUF %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GCN64,FLATSCR %s # CHECK-LABEL: name: check_spill +# FLATSCR: $sgpr33 = S_MOV_B32 0 +# FLATSCR: $flat_scr_lo = S_ADD_U32 $sgpr0, $sgpr11, implicit-def $scc +# FLATSCR: $flat_scr_hi = S_ADDC_U32 $sgpr1, 0, implicit-def $scc, implicit $scc + # S32 with kill # CHECK: V_WRITELANE # CHECK: $sgpr12 = S_MOV_B32 $exec_lo # CHECK: $exec_lo = S_MOV_B32 1 -# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 4 # CHECK: $exec_lo = S_MOV_B32 killed $sgpr12 # S32 without kill # CHECK: V_WRITELANE # CHECK: $sgpr12 = S_MOV_B32 $exec_lo # CHECK: $exec_lo = S_MOV_B32 1 -# CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# MUBUF: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR $vgpr{{[0-9]+}}, $sgpr33, 4 # CHECK: $sgpr12 = V_READLANE # S64 with kill @@ -25,7 +32,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 3 # GCN64: $exec = S_MOV_B64 3 -# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 8 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 @@ -36,7 +44,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 3 # GCN64: $exec = S_MOV_B64 3 -# CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# MUBUF: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR $vgpr{{[0-9]+}}, $sgpr33, 8 # GCN32: $exec_lo = S_MOV_B32 $sgpr12 # GCN64: $exec = S_MOV_B64 $sgpr12_sgpr13 # GCN64: $sgpr13 = V_READLANE @@ -50,7 +59,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 7 # GCN64: $exec = S_MOV_B64 7 -# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 16 +# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 16 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 16 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 @@ -63,7 +73,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 15 # GCN64: $exec = S_MOV_B64 15 -# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 28 +# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 28 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 28 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 @@ -77,7 +88,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 31 # GCN64: $exec = S_MOV_B64 31 -# CHECK: 
BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 44 +# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 44 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 44 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 @@ -94,7 +106,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 255 # GCN64: $exec = S_MOV_B64 255 -# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 64 +# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 64 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 64 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 @@ -119,7 +132,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 65535 # GCN64: $exec = S_MOV_B64 65535 -# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 96 +# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 96 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 96 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 @@ -160,7 +174,8 @@ # GCN64: $sgpr64_sgpr65 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 4294967295 # GCN64: $exec = S_MOV_B64 4294967295 -# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 160 +# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 160 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 160 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr64 # GCN64: $exec = S_MOV_B64 killed $sgpr64_sgpr65 @@ -203,11 +218,12 @@ stackPtrOffsetReg: '$sgpr32' frameOffsetReg: '$sgpr33' argumentInfo: - privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } - dispatchPtr: { reg: '$sgpr4_sgpr5' } - kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } - workGroupIDX: { reg: '$sgpr8' } - privateSegmentWaveByteOffset: { reg: '$sgpr9' } + flatScratchInit: { reg: '$sgpr0_sgpr1' } + dispatchPtr: { reg: '$sgpr2_sgpr3' } + privateSegmentBuffer: { reg: '$sgpr4_sgpr5_sgpr6_sgpr7' } + kernargSegmentPtr: { reg: '$sgpr8_sgpr9' } + workGroupIDX: { reg: '$sgpr10' } + privateSegmentWaveByteOffset: { reg: '$sgpr11' } body: | bb.0: liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7 @@ -245,10 +261,15 @@ # CHECK-LABEL: name: check_reload +# FLATSCR: $sgpr33 = S_MOV_B32 0 +# FLATSCR: $flat_scr_lo = S_ADD_U32 $sgpr0, $sgpr11, implicit-def $scc +# FLATSCR: $flat_scr_hi = S_ADDC_U32 $sgpr1, 0, implicit-def $scc, implicit $scc + # S32 # CHECK: $sgpr12 = S_MOV_B32 $exec_lo # CHECK: $exec_lo = S_MOV_B32 1 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4 # CHECK: $exec_lo = S_MOV_B32 killed $sgpr12 # CHECK: $sgpr12 = V_READLANE @@ -257,7 +278,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 3 # GCN64: $exec = S_MOV_B64 3 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 # CHECK: $sgpr12 = V_READLANE @@ 
-268,7 +290,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 7 # GCN64: $exec = S_MOV_B64 7 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 16 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 16 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 # CHECK: $sgpr12 = V_READLANE @@ -280,7 +303,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 15 # GCN64: $exec = S_MOV_B64 15 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 28 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 28 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 28 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 # CHECK: $sgpr12 = V_READLANE @@ -293,7 +317,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 31 # GCN64: $exec = S_MOV_B64 31 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 44 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 44 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 44 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 # CHECK: $sgpr12 = V_READLANE @@ -307,7 +332,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 255 # GCN64: $exec = S_MOV_B64 255 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 64 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 64 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 64 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 # CHECK: $sgpr12 = V_READLANE @@ -324,7 +350,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 65535 # GCN64: $exec = S_MOV_B64 65535 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 96 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 96 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 96 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 # CHECK: $sgpr12 = V_READLANE @@ -349,7 +376,8 @@ # GCN64: $sgpr64_sgpr65 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 4294967295 # GCN64: $exec = S_MOV_B64 4294967295 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 160 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 160 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 160 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr64 # GCN64: $exec = S_MOV_B64 killed $sgpr64_sgpr65 # CHECK: $sgpr64 = V_READLANE @@ -412,11 +440,12 @@ stackPtrOffsetReg: '$sgpr32' frameOffsetReg: '$sgpr33' argumentInfo: - privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } - dispatchPtr: { reg: '$sgpr4_sgpr5' } - kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } - workGroupIDX: { reg: '$sgpr8' } - privateSegmentWaveByteOffset: { reg: '$sgpr9' } + flatScratchInit: { reg: '$sgpr0_sgpr1' } + dispatchPtr: { reg: '$sgpr2_sgpr3' } + privateSegmentBuffer: { reg: '$sgpr4_sgpr5_sgpr6_sgpr7' } + kernargSegmentPtr: { reg: '$sgpr8_sgpr9' } + workGroupIDX: { reg: '$sgpr10' } + privateSegmentWaveByteOffset: { reg: '$sgpr11' } body: | bb.0: liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7 diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -1,5 +1,7 @@ ; RUN: llc -march=amdgcn 
-mcpu=verde -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck -check-prefixes=CHECK,GFX6 %s ; RUN: llc -regalloc=basic -march=amdgcn -mcpu=tonga -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck -check-prefixes=CHECK,GFX7 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=CHECK,GFX9-FLATSCR,FLATSCR %s +; RUN: llc -march=amdgcn -mcpu=gfx1030 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=CHECK,GFX10-FLATSCR,FLATSCR %s ; ; There is something about Tonga that causes this test to spend a lot of time ; in the default register allocator. @@ -11,6 +13,16 @@ ; Just test that it compiles successfully. ; CHECK-LABEL: test + +; GFX9-FLATSCR: s_mov_b32 [[SOFF1:s[0-9]+]], 4{{$}} +; GFX9-FLATSCR-DAG: scratch_store_dword off, v{{[0-9]+}}, [[SOFF1]] ; 4-byte Folded Spill +; GFX9-FLATSCR-DAG: scratch_store_dword off, v{{[0-9]+}}, [[SOFF1]] offset:{{[0-9]+}} ; 4-byte Folded Spill +; GFX9-FLATSCR: s_movk_i32 [[SOFF2:s[0-9]+]], 0x{{[0-9a-f]+}}{{$}} +; GFX9-FLATSCR-DAG: scratch_load_dword v{{[0-9]+}}, off, [[SOFF2]] ; 4-byte Folded Reload +; GFX9-FLATSCR-DAG: scratch_load_dword v{{[0-9]+}}, off, [[SOFF2]] offset:{{[0-9]+}} ; 4-byte Folded Reload + +; GFX10-FLATSCR: scratch_store_dword off, v{{[0-9]+}}, off offset:{{[0-9]+}} ; 4-byte Folded Spill +; GFX10-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, off offset:{{[0-9]+}} ; 4-byte Folded Reload define amdgpu_kernel void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) { entry: %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) @@ -35,11 +47,17 @@ } ; CHECK-LABEL: test_limited_sgpr -; GFX6: s_add_u32 s32, s32, 0x[[OFFSET:[0-9]+]] +; GFX6: s_add_u32 s32, s32, 0x[[OFFSET:[0-9a-f]+]] ; GFX6-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9:]+}}], s32 -; GFX6-NEXT: s_sub_u32 s32, s32, 0x[[OFFSET:[0-9]+]] +; GFX6-NEXT: s_sub_u32 s32, s32, 0x[[OFFSET:[0-9a-f]+]] ; GFX6: NumSgprs: 48 ; GFX6: ScratchSize: 8608 + +; FLATSCR: s_movk_i32 [[SOFF1:s[0-9]+]], 0x +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dword off, v{{[0-9]+}}, [[SOFF1]] ; 4-byte Folded Spill +; FLATSCR: s_movk_i32 [[SOFF2:s[0-9]+]], 0x +; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF2]] ; 4-byte Folded Reload define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64 x i32> addrspace(1)* %in) #0 { entry: %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=MUBUF %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -amdgpu-enable-flat-scratch -verify-machineinstrs | FileCheck -check-prefix=FLATSCR %s ; FIXME: The MUBUF loads in this test output are incorrect, their SOffset ; should use the frame offset register, not the ABI stack pointer register. 
We @@ -13,44 +14,89 @@ ; An assert was hit when frame offset register was used to address FrameIndex. define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <4 x i32> addrspace(1)* %input, <4 x float> addrspace(1)* %output, i32 %i) { -; GCN-LABEL: kernel_background_evaluate: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 -; GCN-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s38, -1 -; GCN-NEXT: s_mov_b32 s39, 0x31c16000 -; GCN-NEXT: s_add_u32 s36, s36, s3 -; GCN-NEXT: s_addc_u32 s37, s37, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 0x2000 -; GCN-NEXT: v_mov_b32_e32 v2, 0x4000 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: v_mov_b32_e32 v4, 0x400000 -; GCN-NEXT: s_mov_b32 s32, 0xc0000 -; GCN-NEXT: v_add_nc_u32_e64 v40, 4, 0x4000 -; GCN-NEXT: ; implicit-def: $vcc_hi -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_mov_b64 s[0:1], s[36:37] -; GCN-NEXT: s_mov_b64 s[2:3], s[38:39] -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GCN-NEXT: s_cbranch_execz BB0_2 -; GCN-NEXT: ; %bb.1: ; %if.then4.i -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: buffer_load_dword v0, v40, s[36:39], 0 offen -; GCN-NEXT: buffer_load_dword v1, v40, s[36:39], 0 offen offset:4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0 -; GCN-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0 -; GCN-NEXT: buffer_store_dword v0, v0, s[36:39], 0 offen -; GCN-NEXT: BB0_2: ; %shader_eval_surface.exit -; GCN-NEXT: s_endpgm +; MUBUF-LABEL: kernel_background_evaluate: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_load_dword s0, s[0:1], 0x24 +; MUBUF-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MUBUF-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MUBUF-NEXT: s_mov_b32 s38, -1 +; MUBUF-NEXT: s_mov_b32 s39, 0x31c16000 +; MUBUF-NEXT: s_add_u32 s36, s36, s3 +; MUBUF-NEXT: s_addc_u32 s37, s37, 0 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x2000 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 +; MUBUF-NEXT: v_mov_b32_e32 v3, 0 +; MUBUF-NEXT: v_mov_b32_e32 v4, 0x400000 +; MUBUF-NEXT: s_mov_b32 s32, 0xc0000 +; MUBUF-NEXT: v_add_nc_u32_e64 v40, 4, 0x4000 +; MUBUF-NEXT: ; implicit-def: $vcc_hi +; MUBUF-NEXT: s_getpc_b64 s[4:5] +; MUBUF-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: v_mov_b32_e32 v0, s0 +; MUBUF-NEXT: s_mov_b64 s[0:1], s[36:37] +; MUBUF-NEXT: s_mov_b64 s[2:3], s[38:39] +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MUBUF-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; MUBUF-NEXT: s_and_saveexec_b32 s0, vcc_lo +; MUBUF-NEXT: s_cbranch_execz BB0_2 +; MUBUF-NEXT: ; %bb.1: ; %if.then4.i +; MUBUF-NEXT: s_clause 0x1 +; MUBUF-NEXT: buffer_load_dword v0, v40, s[36:39], 0 offen +; MUBUF-NEXT: buffer_load_dword v1, v40, s[36:39], 0 offen offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_nc_u32_e32 v0, v1, v0 +; MUBUF-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0 +; MUBUF-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0 +; MUBUF-NEXT: buffer_store_dword v0, v0, s[36:39], 0 offen +; MUBUF-NEXT: BB0_2: ; %shader_eval_surface.exit +; MUBUF-NEXT: s_endpgm +; +; FLATSCR-LABEL: kernel_background_evaluate: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_add_u32 s2, 
s2, s5 +; FLATSCR-NEXT: s_movk_i32 s32, 0x6000 +; FLATSCR-NEXT: s_addc_u32 s3, s3, 0 +; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; FLATSCR-NEXT: s_load_dword s0, s[0:1], 0x24 +; FLATSCR-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; FLATSCR-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; FLATSCR-NEXT: s_mov_b32 s38, -1 +; FLATSCR-NEXT: s_mov_b32 s39, 0x31c16000 +; FLATSCR-NEXT: s_add_u32 s36, s36, s5 +; FLATSCR-NEXT: s_addc_u32 s37, s37, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v1, 0x2000 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 0x4000 +; FLATSCR-NEXT: v_mov_b32_e32 v3, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v4, 0x400000 +; FLATSCR-NEXT: ; implicit-def: $vcc_hi +; FLATSCR-NEXT: s_getpc_b64 s[4:5] +; FLATSCR-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; FLATSCR-NEXT: s_mov_b64 s[0:1], s[36:37] +; FLATSCR-NEXT: s_mov_b64 s[2:3], s[38:39] +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[4:5] +; FLATSCR-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; FLATSCR-NEXT: s_and_saveexec_b32 s0, vcc_lo +; FLATSCR-NEXT: s_cbranch_execz BB0_2 +; FLATSCR-NEXT: ; %bb.1: ; %if.then4.i +; FLATSCR-NEXT: s_movk_i32 vcc_lo, 0x4000 +; FLATSCR-NEXT: s_nop 1 +; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 +; FLATSCR-NEXT: s_waitcnt_depctr 0xffe3 +; FLATSCR-NEXT: s_movk_i32 vcc_lo, 0x4000 +; FLATSCR-NEXT: scratch_load_dword v1, off, vcc_lo offset:8 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_nc_u32_e32 v0, v1, v0 +; FLATSCR-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0 +; FLATSCR-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0 +; FLATSCR-NEXT: scratch_store_dword off, v0, s0 +; FLATSCR-NEXT: BB0_2: ; %shader_eval_surface.exit +; FLATSCR-NEXT: s_endpgm entry: %sd = alloca < 1339 x i32>, align 8192, addrspace(5) %state = alloca <4 x i32>, align 16, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll @@ -1,6 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-MUBUF %s ; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX906,GFX9,NO-D16-HI %s ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-FLATSCR %s ; GCN-LABEL: {{^}}store_global_hi_v2i16: ; GCN: s_waitcnt @@ -389,7 +390,8 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: 
buffer_store_short v1, v0, s[0:3], 0 offen{{$}} @@ -408,7 +410,8 @@ ; GCN-LABEL: {{^}}store_private_hi_v2f16: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} @@ -427,7 +430,8 @@ ; GCN-LABEL: {{^}}store_private_hi_i32_shift: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} @@ -445,7 +449,8 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}} @@ -464,7 +469,8 @@ ; GCN-LABEL: {{^}}store_private_hi_i8_shift: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}} @@ -481,7 +487,8 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_max_offset: ; GCN: s_waitcnt -; GFX900: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-MUBUF: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-FLATSCR: scratch_store_short_d16_hi off, v0, s32 offset:4094{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0 ; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s32 offset:4094{{$}} @@ -502,7 +509,9 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], 0{{$}} +; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], 0{{$}} +; GFX900-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0 +; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, [[SOFF]]{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], 0{{$}} @@ -522,7 +531,9 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], 0{{$}} +; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], 0{{$}} +; GFX900-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0 +; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, [[SOFF]]{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0 ; NO-D16-HI: buffer_store_byte v0, off, s[0:3], 0{{$}} @@ -634,8 +645,10 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset: ; GCN: s_waitcnt -; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094 +; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, s32 offset:4094 define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 { entry: %obj0 = alloca [10 x i32], align 4, 
addrspace(5) @@ -651,8 +664,10 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset: ; GCN: s_waitcnt -; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4095 +; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, s32 offset:4095 define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5)