Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -240,6 +240,8 @@
                          SDValue &Offset) const;
   bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
                          SDValue &VOffset, SDValue &Offset) const;
+  bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
+                          SDValue &Offset) const;
 
   bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
                         bool &Imm) const;
@@ -1490,6 +1492,8 @@
                                             SDValue Addr, SDValue &Rsrc,
                                             SDValue &VAddr, SDValue &SOffset,
                                             SDValue &ImmOffset) const {
+  if (Subtarget->enableFlatScratch())
+    return false;
   SDLoc DL(Addr);
   MachineFunction &MF = CurDAG->getMachineFunction();
@@ -1562,6 +1566,9 @@
                                              SDValue &SRsrc, SDValue &SOffset,
                                              SDValue &Offset) const {
+  if (Subtarget->enableFlatScratch())
+    return false;
+
   ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
   if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
     return false;
@@ -1680,9 +1687,13 @@
                                           SDValue &Offset) const {
   int64_t OffsetVal = 0;
 
+  unsigned AS = findMemSDNode(N)->getAddressSpace();
+  if (!Subtarget->enableFlatScratch() && AS == AMDGPUAS::PRIVATE_ADDRESS)
+    return false;
+
   if (Subtarget->hasFlatInstOffsets() &&
       (!Subtarget->hasFlatSegmentOffsetBug() ||
-       findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) {
+       AS != AMDGPUAS::FLAT_ADDRESS)) {
     SDValue N0, N1;
     if (CurDAG->isBaseWithConstantOffset(Addr)) {
       N0 = Addr.getOperand(0);
@@ -1694,7 +1705,6 @@
       uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
 
       const SIInstrInfo *TII = Subtarget->getInstrInfo();
-      unsigned AS = findMemSDNode(N)->getAddressSpace();
       if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
         Addr = N0;
         OffsetVal = COffsetVal;
@@ -1727,39 +1737,52 @@
 
       OffsetVal = ImmField;
 
-      // TODO: Should this try to use a scalar add pseudo if the base address
-      // is uniform and saddr is usable?
-      SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
-      SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
-
-      SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
-                                            MVT::i32, N0, Sub0);
-      SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
-                                            MVT::i32, N0, Sub1);
-
       SDValue AddOffsetLo =
           getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
-      SDValue AddOffsetHi =
-          getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
-
-      SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
       SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
 
-      SDNode *Add =
-          CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
-                                 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
-
-      SDNode *Addc = CurDAG->getMachineNode(
-          AMDGPU::V_ADDC_U32_e64, DL, VTs,
-          {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
-
-      SDValue RegSequenceArgs[] = {
-          CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
-          SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
-
-      Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
-                                            MVT::i64, RegSequenceArgs),
-                     0);
+      if (Addr.getValueType().getSizeInBits() == 32) {
+        SmallVector<SDValue, 3> Opnds;
+        Opnds.push_back(N0);
+        Opnds.push_back(AddOffsetLo);
+        unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
+        if (Subtarget->hasAddNoCarry()) {
+          AddOp = AMDGPU::V_ADD_U32_e64;
+          Opnds.push_back(Clamp);
+        }
+        Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+      } else {
+        // TODO: Should this try to use a scalar add pseudo if the base
+        // address is uniform and saddr is usable?
+        SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+        SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+
+        SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                              DL, MVT::i32, N0, Sub0);
+        SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                              DL, MVT::i32, N0, Sub1);
+
+        SDValue AddOffsetHi =
+            getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+
+        SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+
+        SDNode *Add =
+            CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
+                                   {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+
+        SDNode *Addc = CurDAG->getMachineNode(
+            AMDGPU::V_ADDC_U32_e64, DL, VTs,
+            {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+
+        SDValue RegSequenceArgs[] = {
+            CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
+            SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
+
+        Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                              MVT::i64, RegSequenceArgs),
+                       0);
+      }
       }
     }
   }
@@ -1832,6 +1855,64 @@
   return true;
 }
 
+// Match (32-bit SGPR base) + sext(imm offset)
+bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N,
+                                            SDValue Addr,
+                                            SDValue &SAddr,
+                                            SDValue &Offset) const {
+  if (!Subtarget->enableFlatScratch() || Addr->isDivergent())
+    return false;
+
+  SAddr = Addr;
+  int64_t COffsetVal = 0;
+
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
+    SAddr = Addr.getOperand(0);
+  }
+
+  if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
+    SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
+  } else if (SAddr.getOpcode() == ISD::ADD &&
+             isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
+    // Materialize this into a scalar move for scalar address to avoid
+    // readfirstlane.
+    auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
+                                              FI->getValueType(0));
+    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, SDLoc(SAddr),
+                                           MVT::i32, TFI, SAddr.getOperand(1)),
+                    0);
+  }
+
+  const SIInstrInfo *TII = Subtarget->getInstrInfo();
+
+  if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
+    int64_t RemainderOffset = COffsetVal;
+    int64_t ImmField = 0;
+    const unsigned NumBits = TII->getNumFlatOffsetBits(true);
+    // Use signed division by a power of two to truncate towards 0.
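Note on the offset split used by SelectFlatOffset and SelectScratchSAddr above: signed division by a power of two truncates toward zero, so the immediate field keeps the sign of the original offset and the split is exact. A minimal standalone sketch; the 12-bit width is only an assumption for illustration, the real width comes from SIInstrInfo::getNumFlatOffsetBits:

#include <cassert>
#include <cstdint>

// Split an offset into a part folded into the base register and a part
// that fits the instruction's signed immediate field.
void splitFlatOffset(int64_t COffsetVal, unsigned NumBits,
                     int64_t &RemainderOffset, int64_t &ImmField) {
  int64_t D = 1LL << (NumBits - 1);
  RemainderOffset = (COffsetVal / D) * D;  // added to the base register
  ImmField = COffsetVal - RemainderOffset; // fits the signed imm field
  assert(ImmField > -D && ImmField < D);
  assert(RemainderOffset + ImmField == COffsetVal);
}

int main() {
  int64_t Rem, Imm;
  splitFlatOffset(-4100, 12, Rem, Imm); // Rem == -4096, Imm == -4
  splitFlatOffset(5000, 12, Rem, Imm);  // Rem == 4096,  Imm == 904
  return 0;
}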
+    int64_t D = 1LL << (NumBits - 1);
+    RemainderOffset = (COffsetVal / D) * D;
+    ImmField = COffsetVal - RemainderOffset;
+
+    assert(TII->isLegalFLATOffset(ImmField, AMDGPUAS::PRIVATE_ADDRESS, true));
+    assert(RemainderOffset + ImmField == COffsetVal);
+
+    COffsetVal = ImmField;
+
+    SDLoc DL(N);
+    SDValue AddOffset =
+        getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, DL, MVT::i32,
+                                           SAddr, AddOffset), 0);
+  }
+
+  Offset = CurDAG->getTargetConstant(COffsetVal, SDLoc(), MVT::i16);
+
+  return true;
+}
+
 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
                                           SDValue &Offset, bool &Imm) const {
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -952,6 +952,8 @@
     return true;
   }
 
+  bool enableFlatScratch() const;
+
   void overrideSchedPolicy(MachineSchedPolicy &Policy,
                            unsigned NumRegionInstrs) const override;
 
@@ -1005,6 +1007,10 @@
     return HasDPP8;
   }
 
+  bool hasSGPRNull() const {
+    return getGeneration() >= GFX10;
+  }
+
   bool hasR128A16() const {
     return HasR128A16;
   }
Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -50,6 +50,11 @@
   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
   cl::init(false));
 
+static cl::opt<bool> EnableFlatScratch(
+  "amdgpu-enable-flat-scratch",
+  cl::desc("Use flat scratch instructions"),
+  cl::init(false));
+
 GCNSubtarget::~GCNSubtarget() = default;
 
 R600Subtarget &
@@ -286,6 +291,10 @@
       *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
 }
 
+bool GCNSubtarget::enableFlatScratch() const {
+  return EnableFlatScratch && hasFlatScratchInsts();
+}
+
 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
   if (getGeneration() < GFX10)
     return 1;
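Note on hasSGPRNull: it is consumed by SIRegisterInfo::eliminateFrameIndex later in this patch. On GFX10 the scalar null register reads as zero (visible as the null saddr operand in the gfx1010 check lines below), so an access with a zero absolute offset needs no scavenged SGPR. A hedged sketch of that choice; the enum and function names are illustrative, not the in-tree API:

// Sketch only: 'HasSGPRNull' mirrors GCNSubtarget::hasSGPRNull(); the
// enum stands in for the register/scavenger machinery in the real code.
enum class ScratchBase { SGPRNull, ScavengedSGPR };

ScratchBase chooseZeroBase(long Offset, bool HasSGPRNull) {
  if (Offset == 0 && HasSGPRNull)
    return ScratchBase::SGPRNull;    // 'null' operand, no extra s_mov_b32
  return ScratchBase::ScavengedSGPR; // s_mov_b32/s_add_u32 into a scavenged SGPR
}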
Index: llvm/lib/Target/AMDGPU/FLATInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -8,8 +8,10 @@
 
 def FLATOffset  : ComplexPattern<i64, 2, "SelectFlatOffset<false>", [], [SDNPWantRoot], -10>;
 def FLATOffsetSigned : ComplexPattern<i64, 2, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>;
+def ScratchOffset : ComplexPattern<i32, 2, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>;
 
 def GlobalSAddr : ComplexPattern<i64, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>;
+def ScratchSAddr : ComplexPattern<i32, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>;
 
 //===----------------------------------------------------------------------===//
 // FLAT classes
@@ -839,6 +841,37 @@
   (inst $vaddr, $data, $offset)
 >;
 
+class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset))),
+  (inst $vaddr, $offset)
+>;
+
+class ScratchSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset), vt:$in),
+  (inst $vaddr, $offset, 0, 0, 0, $in)
+>;
+
+class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (node vt:$data, (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset)),
+  (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $offset)
+>;
+
+class ScratchLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset))),
+  (inst $saddr, $offset)
+>;
+
+class ScratchLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset), vt:$in)),
+  (inst $saddr, $offset, 0, 0, 0, $in)
+>;
+
+class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (node vt:$data, (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset)),
+  (inst getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
+>;
+
 let OtherPredicates = [HasFlatAddressSpace] in {
 
 def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
@@ -996,6 +1029,37 @@
   }
 }
 
+multiclass ScratchFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+  def : ScratchLoadSignedPat <inst, node, vt> {
+    let AddedComplexity = 25;
+  }
+
+  def : ScratchLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+    let AddedComplexity = 26;
+  }
+}
+
+multiclass ScratchFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+  def : ScratchStoreSignedPat <inst, node, vt> {
+    let AddedComplexity = 25;
+  }
+
+  def : ScratchStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+    let AddedComplexity = 26;
+  }
+}
+
+multiclass ScratchFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+  def : ScratchSignedLoadPat_D16 <inst, node, vt> {
+    let AddedComplexity = 25;
+  }
+
+  def : ScratchLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+    let AddedComplexity = 26;
+  }
+}
+
 let OtherPredicates = [HasFlatGlobalInsts] in {
 
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i32>;
@@ -1096,6 +1160,62 @@
 
 } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
 
+let OtherPredicates = [HasFlatScratchInsts] in {
+
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, extloadi8_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, zextloadi8_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_SBYTE, sextloadi8_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, extloadi16_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, zextloadi16_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_SSHORT, sextloadi16_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, extloadi8_private, i16>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, zextloadi8_private, i16>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_SBYTE, sextloadi8_private, i16>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, load_private, i16>;
+
+foreach vt = Reg32Types.types in {
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORD, load_private, vt>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_DWORD, store_private, vt>;
+}
+
+foreach vt = VReg_64.RegTypes in {
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORDX2, load_private, vt>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX2, store_private, vt>;
+}
+
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORDX3, load_private, v3i32>;
+
+foreach vt = VReg_128.RegTypes in {
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORDX4, load_private, vt>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX4, store_private, vt>;
+}
+
+defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i32>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, truncstorei16_private, i32>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i16>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, store_private, i16>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX3, store_private, v3i32>;
+
+let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts] in {
+defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT_D16_HI, truncstorei16_hi16_private, i32>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE_D16_HI, truncstorei8_hi16_private, i32>;
+
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2f16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_private, v2f16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16_HI, load_d16_hi_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16_HI, load_d16_hi_private, v2f16>;
+
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16, az_extloadi8_d16_lo_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16, az_extloadi8_d16_lo_private, v2f16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16, sextloadi8_d16_lo_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16, sextloadi8_d16_lo_private, v2f16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f16>;
+}
+
+} // End OtherPredicates = [HasFlatScratchInsts]
+
 //===----------------------------------------------------------------------===//
 // Target
Index: llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -173,7 +173,7 @@
                                  int OpNo,
                                  const MachineOperand &OpToFold) {
   return OpToFold.isFI() &&
-    (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
+    TII->isMUBUF(UseMI) &&
     OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(),
                                        AMDGPU::OpName::vaddr);
 }
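Note on the AddedComplexity values in the multiclasses above: the _SADDR variant is given 26 against 25 for the vaddr form, so a uniform address prefers the scalar-addressed encoding (scratch_load_dword v0, off, s0 in the tests below) while a divergent address only matches the vaddr form (scratch_load_dword v0, v0, off). A small sketch of the tie-break; the types are illustrative, not TableGen's:

// Sketch: a uniform address can match both patterns; the one with the
// higher AddedComplexity (the _SADDR variant, 26 vs. 25) wins.
struct Pattern { const char *Form; int AddedComplexity; };

const Pattern *selectScratchPattern(bool AddrIsUniform) {
  static const Pattern VAddr = {"vaddr", 25};
  static const Pattern SAddr = {"saddr", 26};
  if (!AddrIsUniform)
    return &VAddr; // SelectScratchSAddr rejects divergent addresses
  return SAddr.AddedComplexity > VAddr.AddedComplexity ? &SAddr : &VAddr;
}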
Index: llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -365,6 +365,10 @@
   return ScratchRsrcReg;
 }
 
+static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
+  return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
+}
+
 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                 MachineBasicBlock &MBB) const {
   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
@@ -461,7 +465,7 @@
     Register SPReg = MFI->getStackPtrOffsetReg();
     assert(SPReg != AMDGPU::SP_REG);
     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
-        .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize());
+        .addImm(MF.getFrameInfo().getStackSize() * getScratchScaleFactor(ST));
   }
 
   if (hasFP(MF)) {
@@ -888,11 +892,11 @@
     // s_and_b32 s32, tmp_reg, 0b111...0000
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
         .addReg(StackPtrReg)
-        .addImm((Alignment - 1) * ST.getWavefrontSize())
+        .addImm((Alignment - 1) * getScratchScaleFactor(ST))
         .setMIFlag(MachineInstr::FrameSetup);
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
         .addReg(ScratchSPReg, RegState::Kill)
-        .addImm(-Alignment * ST.getWavefrontSize())
+        .addImm(-Alignment * getScratchScaleFactor(ST))
         .setMIFlag(MachineInstr::FrameSetup);
     FuncInfo->setIsStackRealigned(true);
   } else if ((HasFP = hasFP(MF))) {
@@ -914,7 +918,7 @@
   if (HasFP && RoundedSize != 0) {
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
         .addReg(StackPtrReg)
-        .addImm(RoundedSize * ST.getWavefrontSize())
+        .addImm(RoundedSize * getScratchScaleFactor(ST))
         .setMIFlag(MachineInstr::FrameSetup);
   }
 
@@ -976,7 +980,7 @@
   if (RoundedSize != 0 && hasFP(MF)) {
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
         .addReg(StackPtrReg)
-        .addImm(RoundedSize * ST.getWavefrontSize())
+        .addImm(RoundedSize * getScratchScaleFactor(ST))
         .setMIFlag(MachineInstr::FrameDestroy);
   }
 
@@ -1264,7 +1268,7 @@
     unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
     BuildMI(MBB, I, DL, TII->get(Op), SPReg)
         .addReg(SPReg)
-        .addImm(Amount * ST.getWavefrontSize());
+        .addImm(Amount * getScratchScaleFactor(ST));
   } else if (CalleePopAmount != 0) {
     llvm_unreachable("is this used?");
   }
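Note on getScratchScaleFactor: MUBUF scratch offsets are swizzled per wave, so stack-pointer immediates are scaled by the wavefront size, while flat scratch addressing is already per-lane and byte-addressed. This unit change is behind most of the MUBUF/FLATSCR check-line pairs in the test updates below. A sketch of the arithmetic:

#include <cstdint>

// Sketch: the same 16-byte stack adjustment is 0x400 under wave64
// MUBUF scratch but stays 16 under flat scratch (see the MUBUF/FLATSCR
// check lines in callee-frame-setup.ll below).
uint64_t spAdjustImm(uint64_t StackSizeBytes, bool EnableFlatScratch,
                     unsigned WavefrontSize) {
  unsigned Scale = EnableFlatScratch ? 1 : WavefrontSize;
  return StackSizeBytes * Scale; // 16 * 64 == 0x400
}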
Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -88,7 +88,7 @@
                               const MachineFunction &MF) const override;
   bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override;
 
-  int64_t getMUBUFInstrOffset(const MachineInstr *MI) const;
+  int64_t getScratchInstrOffset(const MachineInstr *MI) const;
 
   int64_t getFrameIndexInstrOffset(const MachineInstr *MI,
                                    int Idx) const override;
Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -384,8 +384,8 @@
   return true;
 }
 
-int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
-  assert(SIInstrInfo::isMUBUF(*MI));
+int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
+  assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));
 
   int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::offset);
@@ -394,23 +394,29 @@
 
 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                  int Idx) const {
-  if (!SIInstrInfo::isMUBUF(*MI))
+  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
     return 0;
 
-  assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
-                                           AMDGPU::OpName::vaddr) &&
+  assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                            AMDGPU::OpName::vaddr) ||
+          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                             AMDGPU::OpName::saddr))) &&
          "Should never see frame index on non-address operand");
 
-  return getMUBUFInstrOffset(MI);
+  return getScratchInstrOffset(MI);
 }
 
 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
   if (!MI->mayLoadOrStore())
     return false;
 
-  int64_t FullOffset = Offset + getMUBUFInstrOffset(MI);
+  int64_t FullOffset = Offset + getScratchInstrOffset(MI);
 
-  return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);
+  if (SIInstrInfo::isMUBUF(*MI))
+    return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
 }
 
 void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
@@ -425,9 +431,11 @@
 
   MachineFunction *MF = MBB->getParent();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
+                                           : AMDGPU::V_MOV_B32_e32;
 
   if (Offset == 0) {
-    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
+    BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
       .addFrameIndex(FrameIdx);
     return;
   }
@@ -435,13 +443,22 @@
   MachineRegisterInfo &MRI = MF->getRegInfo();
   Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
 
-  Register FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  Register FIReg = MRI.createVirtualRegister(
+      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
+                             : &AMDGPU::VGPR_32RegClass);
 
   BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
     .addImm(Offset);
-  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg)
+  BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
     .addFrameIndex(FrameIdx);
 
+  if (ST.enableFlatScratch()) {
+    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_U32), BaseReg)
+      .addReg(OffsetReg, RegState::Kill)
+      .addReg(FIReg);
+    return;
+  }
+
   TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
     .addReg(OffsetReg, RegState::Kill)
     .addReg(FIReg)
@@ -451,6 +468,7 @@
 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
                                        int64_t Offset) const {
   const SIInstrInfo *TII = ST.getInstrInfo();
+  bool IsFlat = TII->isFLATScratch(MI);
 
 #ifndef NDEBUG
   // FIXME: Is it possible to be storing a frame index to itself?
@@ -465,21 +483,32 @@
   }
 #endif
 
-  MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+  MachineOperand *FIOp =
+      TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
+                                      : AMDGPU::OpName::vaddr);
 #ifndef NDEBUG
   MachineBasicBlock *MBB = MI.getParent();
   MachineFunction *MF = MBB->getParent();
 #endif
   assert(FIOp && FIOp->isFI() && "frame index must be address operand");
-  assert(TII->isMUBUF(MI));
+  assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
+
+  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
+  int64_t NewOffset = OffsetOp->getImm() + Offset;
+
+  if (IsFlat) {
+    assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true) &&
+           "offset should be legal");
+    FIOp->ChangeToRegister(BaseReg, false);
+    OffsetOp->setImm(NewOffset);
+    return;
+  }
 
   MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
   assert(SOffset->getReg() ==
          MF->getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg() &&
          "should only be seeing stack pointer offset relative FrameIndex");
 
-  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
-  int64_t NewOffset = OffsetOp->getImm() + Offset;
   assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
          "offset should be legal");
 
@@ -494,12 +523,16 @@
 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                         Register BaseReg,
                                         int64_t Offset) const {
-  if (!SIInstrInfo::isMUBUF(*MI))
+  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
     return false;
 
-  int64_t NewOffset = Offset + getMUBUFInstrOffset(MI);
+  int64_t NewOffset = Offset + getScratchInstrOffset(MI);
+
+  if (SIInstrInfo::isMUBUF(*MI))
+    return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);
 
-  return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
 }
 
 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
@@ -1341,6 +1374,93 @@
     default: {
       const DebugLoc &DL = MI->getDebugLoc();
+
+      if (TII->isFLATScratch(*MI)) {
+        // The offset is always swizzled, just replace it.
+        if (FrameReg)
+          FIOp.ChangeToRegister(FrameReg, false);
+
+        int64_t Offset = FrameInfo.getObjectOffset(Index);
+        if (!Offset)
+          return;
+
+        MachineOperand *OffsetOp =
+            TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
+        int64_t NewOffset = Offset + OffsetOp->getImm();
+        if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
+                                   true)) {
+          OffsetOp->setImm(NewOffset);
+          if (FrameReg)
+            return;
+          Offset = 0;
+        }
+
+        Register SReg = AMDGPU::SGPR_NULL;
+        // On GFX10 we have the null register to use here.
+        // Otherwise we need to materialize 0 into an SGPR.
+        if (Offset || !ST.hasSGPRNull()) {
+          SReg = RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0);
+          if (FrameReg)
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SReg)
+              .addReg(FrameReg)
+              .addImm(Offset);
+          else
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SReg)
+              .addImm(Offset);
+        }
+        FIOp.ChangeToRegister(SReg, false, false, true);
+        return;
+      }
+
+      if (ST.enableFlatScratch()) {
+        int64_t Offset = FrameInfo.getObjectOffset(Index);
+        if (!FrameReg) {
+          FIOp.ChangeToImmediate(Offset);
+          if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
+            return;
+        }
+
+        // We need to use a register here. Check if we can use an SGPR or
+        // need a VGPR.
+        FIOp.ChangeToRegister(AMDGPU::M0, false);
+        bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
+        const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
+                                                : &AMDGPU::VGPR_32RegClass;
+
+        if (!Offset && UseSGPR) {
+          FIOp.setReg(FrameReg);
+          return;
+        }
+
+        Register TmpReg = RS->scavengeRegister(RC, MI, 0);
+        FIOp.setReg(TmpReg);
+        FIOp.setIsKill(true);
+
+        if (!Offset || !FrameReg) {
+          unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
+          auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
+          if (FrameReg)
+            MIB.addReg(FrameReg);
+          if (Offset)
+            MIB.addImm(Offset);
+          return;
+        }
+
+        Register TmpSReg =
+            UseSGPR ? TmpReg
+                    : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0);
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), TmpSReg)
+          .addReg(FrameReg)
+          .addImm(Offset);
+
+        if (UseSGPR)
+          return;
+
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+          .addReg(TmpSReg, RegState::Kill);
+
+        return;
+      }
+
       bool IsMUBUF = TII->isMUBUF(*MI);
 
       if (!IsMUBUF && !MFI->isEntryFunction()) {
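Note on the eliminateFrameIndex flow above for flat scratch: it tries progressively more expensive rewrites of the frame-index operand. A condensed sketch of the fall-through order; the real code scavenges registers and builds MIR, this only names the cases:

// Sketch of the rewrite order for a frame index under flat scratch:
// fold the offset as an immediate if the instruction accepts one, else
// keep it in an SGPR (possibly an s_add_u32 of frame register and
// offset), and only copy into a VGPR when the operand cannot take a
// scalar register.
enum class FIRewrite { FoldImmediate, KeepInSGPR, CopyToVGPR };

FIRewrite classifyRewrite(bool ImmOperandLegal, bool SGPROperandLegal) {
  if (ImmOperandLegal)
    return FIRewrite::FoldImmediate;
  if (SGPROperandLegal)
    return FIRewrite::KeepInSGPR;
  return FIRewrite::CopyToVGPR;
}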
Index: llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -1,6 +1,7 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
 
 declare hidden void @external_void_func_void() #0
 
@@ -53,7 +54,8 @@
 ; GCN: v_writelane_b32 v40, s33, 4
 ; GCN: s_mov_b32 s33, s32
-; GCN: s_add_u32 s32, s32, 0x400
+; MUBUF: s_add_u32 s32, s32, 0x400
+; FLATSCR: s_add_u32 s32, s32, 16
 ; GCN: s_swappc_b64
 ; GCN-NEXT: s_swappc_b64
Index: llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MUBUF %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,MUBUF %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FLATSCR %s
 
 ; GCN-LABEL: {{^}}callee_no_stack:
 ; GCN: ; %bb.0:
@@ -32,7 +33,8 @@
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}}
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}}
+; MUBUF-NEXT: buffer_store_dword 
v0, off, s[0:3], s32{{$}} +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack() #0 { @@ -48,10 +50,13 @@ ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_add_u32 s32, s32, 0x200 +; MUBUF-NEXT: s_add_u32 s32, s32, 0x200 +; FLATSCR-NEXT: s_add_u32 s32, s32, 8 ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4{{$}} -; GCN-NEXT: s_sub_u32 s32, s32, 0x200 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4{{$}} +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4{{$}} +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 8 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -65,7 +70,8 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}} +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}} +; FLATSCR-NEXT: scratch_store_dword off, v0, s32{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_non_leaf() #2 { @@ -82,19 +88,22 @@ ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN: v_writelane_b32 [[CSR_VGPR]], s33, 2 ; GCN-DAG: s_mov_b32 s33, s32 -; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}} +; MUBUF-DAG: s_add_u32 s32, s32, 0x400{{$}} +; FLATSCR-DAG: s_add_u32 s32, s32, 16{{$}} ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} +; MUBUF-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} +; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33{{$}} ; GCN: s_swappc_b64 ; GCN-DAG: v_readlane_b32 s5, [[CSR_VGPR]] ; GCN-DAG: v_readlane_b32 s4, [[CSR_VGPR]] -; GCN: s_sub_u32 s32, s32, 0x400{{$}} +; MUBUF: s_sub_u32 s32, s32, 0x400{{$}} +; FLATSCR: s_sub_u32 s32, s32, 16{{$}} ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -120,7 +129,8 @@ ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-DAG: s_add_u32 s32, s32, 0x400 +; MUBUF-DAG: s_add_u32 s32, s32, 0x400 +; FLATSCR-DAG: s_add_u32 s32, s32, 16 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s33, [[FP_SPILL_LANE:[0-9]+]] ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0 @@ -130,7 +140,8 @@ ; GCN-DAG: v_readlane_b32 s4, v40, 0 ; GCN-DAG: v_readlane_b32 s5, v40, 1 -; GCN: s_sub_u32 s32, s32, 0x400 +; MUBUF: s_sub_u32 s32, s32, 0x400 +; FLATSCR: s_sub_u32 s32, s32, 16 ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], [[FP_SPILL_LANE]] ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload @@ -209,15 +220,18 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:8 +; MUBUF-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:8 +; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33 offset:8 ; GCN: ;;#ASMSTART ; GCN-NEXT: ; clobber v41 ; GCN-NEXT: ;;#ASMEND ; 
GCN: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN: s_add_u32 s32, s32, 0x300 -; GCN-NEXT: s_sub_u32 s32, s32, 0x300 +; MUBUF: s_add_u32 s32, s32, 0x300 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300 +; FLATSCR: s_add_u32 s32, s32, 12 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 12 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -235,12 +249,15 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-COUNT-63: v_writelane_b32 v1 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8 +; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8 +; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:8 ; GCN: ;;#ASMSTART ; GCN-COUNT-63: v_readlane_b32 s{{[0-9]+}}, v1 -; GCN: s_add_u32 s32, s32, 0x300 -; GCN-NEXT: s_sub_u32 s32, s32, 0x300 +; MUBUF: s_add_u32 s32, s32, 0x300 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300 +; FLATSCR: s_add_u32 s32, s32, 12 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 12 ; GCN-NEXT: v_readlane_b32 s33, v1, 63 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -268,13 +285,16 @@ ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-COUNT-64: v_writelane_b32 v1, -; GCN: buffer_store_dword +; MUBUF: buffer_store_dword +; FLATSCR: scratch_store_dword ; GCN: ;;#ASMSTART ; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1 ; GCN: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN: s_add_u32 s32, s32, 0x300 -; GCN-NEXT: s_sub_u32 s32, s32, 0x300 +; MUBUF: s_add_u32 s32, s32, 0x300 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300 +; FLATSCR: s_add_u32 s32, s32, 12 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 12 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -296,13 +316,18 @@ ; GCN-LABEL: {{^}}realign_stack_no_fp_elim: ; GCN: s_waitcnt -; GCN-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0 -; GCN-NEXT: s_mov_b32 s4, s33 -; GCN-NEXT: s_and_b32 s33, [[SCRATCH]], 0xfff80000 -; GCN-NEXT: s_add_u32 s32, s32, 0x100000 -; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN-NEXT: buffer_store_dword [[ZERO]], off, s[0:3], s33 -; GCN-NEXT: s_sub_u32 s32, s32, 0x100000 +; MUBUF-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0 +; FLATSCR-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x1fff +; GCN-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_and_b32 s33, [[SCRATCH]], 0xfff80000 +; FLATSCR-NEXT: s_and_b32 s33, [[SCRATCH]], 0xffffe000 +; MUBUF-NEXT: s_add_u32 s32, s32, 0x100000 +; FLATSCR-NEXT: s_add_u32 s32, s32, 0x4000 +; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; MUBUF-NEXT: buffer_store_dword [[ZERO]], off, s[0:3], s33 +; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s33 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x100000 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x4000 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -319,12 +344,15 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; GCN: v_writelane_b32 v1, s31, 1 -; GCN: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 +; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 +; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4 ; GCN: ;;#ASMSTART ; GCN: v_readlane_b32 s4, v1, 0 -; GCN-NEXT: s_add_u32 s32, s32, 0x200 -; GCN-NEXT: v_readlane_b32 s5, v1, 1 -; GCN-NEXT: s_sub_u32 s32, s32, 0x200 +; MUBUF-NEXT: s_add_u32 s32, s32, 0x200 +; FLATSCR-NEXT: s_add_u32 s32, s32, 8 +; GCN-NEXT: v_readlane_b32 s5, v1, 1 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200 +; FLATSCR-NEXT: 
s_sub_u32 s32, s32, 8 ; GCN-NEXT: v_readlane_b32 s33, v1, 2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -353,14 +381,17 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 -; GCN-DAG: buffer_store_dword -; GCN: s_add_u32 s32, s32, 0x300{{$}} +; MUBUF-DAG: buffer_store_dword +; FLATSCR-DAG: scratch_store_dword +; MUBUF: s_add_u32 s32, s32, 0x300{{$}} +; FLATSCR: s_add_u32 s32, s32, 12{{$}} ; GCN: ;;#ASMSTART -; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0 -; GCN-NEXT: v_readlane_b32 s5, [[CSR_VGPR]], 1 -; GCN-NEXT: s_sub_u32 s32, s32, 0x300{{$}} +; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0 +; GCN-NEXT: v_readlane_b32 s5, [[CSR_VGPR]], 1 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300{{$}} +; FLATSCR-NEXT: s_sub_u32 s32, s32, 12{{$}} ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -396,17 +427,20 @@ ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 -; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 -; GCN-DAG: s_add_u32 s32, s32, 0x40300{{$}} -; GCN-DAG: buffer_store_dword +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0 +; GCN-DAG: s_mov_b32 s33, s32 +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 +; MUBUF-DAG: s_add_u32 s32, s32, 0x40300{{$}} +; FLATSCR-DAG: s_add_u32 s32, s32, 0x100c{{$}} +; MUBUF-DAG: buffer_store_dword +; FLATSCR-DAG: scratch_store_dword ; GCN: ;;#ASMSTART ; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0 ; GCN-NEXT: v_readlane_b32 s5, [[CSR_VGPR]], 1 -; GCN-NEXT: s_sub_u32 s32, s32, 0x40300{{$}} +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x40300{{$}} +; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x100c{{$}} ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 @@ -447,10 +481,13 @@ ; GCN-LABEL: {{^}}ipra_call_with_stack: ; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN: s_mov_b32 s33, s32 -; GCN: s_add_u32 s32, s32, 0x400 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33{{$}} -; GCN: s_swappc_b64 -; GCN: s_sub_u32 s32, s32, 0x400 +; MUBUF: s_add_u32 s32, s32, 0x400 +; FLATSCR: s_add_u32 s32, s32, 16 +; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33{{$}} +; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33{{$}} +; GCN: s_swappc_b64 +; MUBUF: s_sub_u32 s32, s32, 0x400 +; FLATSCR: s_sub_u32 s32, s32, 16 ; GCN: s_mov_b32 s33, [[FP_COPY:s[0-9]+]] define void @ipra_call_with_stack() #0 { %alloca = alloca i32, addrspace(5) Index: llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -1,15 +1,27 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s define <2 x half> @chain_hi_to_lo_private() { -; GCN-LABEL: chain_hi_to_lo_private: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_private: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: chain_hi_to_lo_private: +; FLATSCR: ; %bb.0: ; %bb +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s4, 2 +; FLATSCR-NEXT: scratch_load_ushort v0, off, s4 +; FLATSCR-NEXT: s_mov_b32 s4, 0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s4 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1 %load_lo = load half, half addrspace(5)* %gep_lo @@ -23,14 +35,23 @@ } define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %base_lo, half addrspace(5)* %base_hi) { -; GCN-LABEL: chain_hi_to_lo_private_different_bases: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_private_different_bases: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: chain_hi_to_lo_private_different_bases: +; FLATSCR: ; %bb.0: ; %bb +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: scratch_load_ushort v0, v0, off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_load_short_d16_hi v0, v1, off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, half addrspace(5)* %base_lo %load_hi = load half, half addrspace(5)* %base_hi @@ -42,14 +63,23 @@ } define <2 x half> @chain_hi_to_lo_arithmatic(half addrspace(5)* %base, half %in) { -; GCN-LABEL: chain_hi_to_lo_arithmatic: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_arithmatic: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX900-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: chain_hi_to_lo_arithmatic: +; FLATSCR: ; %bb.0: ; %bb +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: v_add_f16_e32 v1, 1.0, v1 +; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, v1 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] bb: 
%arith_lo = fadd half %in, 1.0 %load_hi = load half, half addrspace(5)* %base @@ -191,38 +221,75 @@ ; Make sure we don't lose any of the private stores. define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %in, <2 x i16> addrspace(1)* nocapture %out) #0 { -; GCN-LABEL: vload2_private: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: global_load_ushort v2, v[0:1], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 -; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:6 -; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:4 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:8 -; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 -; GCN-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, v4 -; GCN-NEXT: buffer_load_short_d16_hi v3, off, s[0:3], 0 offset:8 -; GCN-NEXT: v_lshl_or_b32 v2, v4, 16, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GCN-NEXT: s_endpgm +; GFX900-LABEL: vload2_private: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX900-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX900-NEXT: s_add_u32 s0, s0, s9 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v1, s5 +; GFX900-NEXT: global_load_ushort v2, v[0:1], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 +; GFX900-NEXT: global_load_ushort v2, v[0:1], off offset:2 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:6 +; GFX900-NEXT: global_load_ushort v2, v[0:1], off offset:4 +; GFX900-NEXT: v_mov_b32_e32 v0, s6 +; GFX900-NEXT: v_mov_b32_e32 v1, s7 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:8 +; GFX900-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 +; GFX900-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6 +; GFX900-NEXT: s_waitcnt vmcnt(1) +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: buffer_load_short_d16_hi v3, off, s[0:3], 0 offset:8 +; GFX900-NEXT: v_lshl_or_b32 v2, v4, 16, v2 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX900-NEXT: s_endpgm +; +; FLATSCR-LABEL: vload2_private: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, s4 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s5 +; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: 
scratch_store_short off, v2, vcc_hi offset:4 +; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:2 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:6 +; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:4 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, s6 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s7 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:8 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: scratch_load_ushort v2, off, vcc_hi offset:4 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: scratch_load_ushort v4, off, vcc_hi offset:6 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: s_waitcnt vmcnt(1) +; FLATSCR-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v3, v4 +; FLATSCR-NEXT: scratch_load_short_d16_hi v3, off, vcc_hi offset:8 +; FLATSCR-NEXT: v_lshl_or_b32 v2, v4, 16, v2 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; FLATSCR-NEXT: s_endpgm entry: %loc = alloca [3 x i16], align 2, addrspace(5) %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)* @@ -297,16 +364,27 @@ } define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) { -; GCN-LABEL: chain_hi_to_lo_private_other_dep: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] -; GCN-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_private_other_dep: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: chain_hi_to_lo_private_other_dep: +; FLATSCR: ; %bb.0: ; %bb +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; FLATSCR-NEXT: scratch_load_short_d16 v1, v0, off offset:2 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, v1 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1 %load_lo = load i16, i16 addrspace(5)* %gep_lo Index: llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=-unaligned-scratch-access < %s | FileCheck -check-prefixes=GCN,GFX7-ALIGNED %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=+unaligned-scratch-access < %s | FileCheck -check-prefixes=GCN,GFX7-UNALIGNED %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-scratch-access < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; 
RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-scratch-access -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,GFX9-FLASTSCR %s ; Should not merge this to a dword load define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 { @@ -35,6 +36,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLASTSCR-LABEL: private_load_2xi16_align2: +; GFX9-FLASTSCR: ; %bb.0: +; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLASTSCR-NEXT: scratch_load_ushort v1, v0, off +; GFX9-FLASTSCR-NEXT: scratch_load_ushort v0, v0, off offset:2 +; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLASTSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 2 %p.1 = load i16, i16 addrspace(5)* %gep.p, align 2 @@ -78,6 +88,16 @@ ; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLASTSCR-LABEL: private_store_2xi16_align2: +; GFX9-FLASTSCR: ; %bb.0: +; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-FLASTSCR-NEXT: scratch_store_short v1, v0, off +; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-FLASTSCR-NEXT: scratch_store_short v1, v0, off offset:2 +; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 store i16 1, i16 addrspace(5)* %r, align 2 store i16 2, i16 addrspace(5)* %gep.r, align 2 @@ -124,6 +144,17 @@ ; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLASTSCR-LABEL: private_load_2xi16_align1: +; GFX9-FLASTSCR: ; %bb.0: +; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off +; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-FLASTSCR-NEXT: s_mov_b32 s4, 0xffff +; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLASTSCR-NEXT: v_bfi_b32 v1, v1, 0, v0 +; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 1 %p.1 = load i16, i16 addrspace(5)* %gep.p, align 1 @@ -167,6 +198,14 @@ ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLASTSCR-LABEL: private_store_2xi16_align1: +; GFX9-FLASTSCR: ; %bb.0: +; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX9-FLASTSCR-NEXT: scratch_store_dword v1, v0, off +; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 store i16 1, i16 addrspace(5)* %r, align 1 store i16 2, i16 addrspace(5)* %gep.r, align 1 @@ -206,6 +245,17 @@ ; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLASTSCR-LABEL: private_load_2xi16_align4: +; GFX9-FLASTSCR: ; %bb.0: +; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off +; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-FLASTSCR-NEXT: s_mov_b32 s4, 0xffff +; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-FLASTSCR-NEXT: v_bfi_b32 v1, v1, 0, v0 +; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 4 %p.1 = load i16, i16 addrspace(5)* %gep.p, align 2 @@ -228,13 +278,37 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GCN-LABEL: private_store_2xi16_align4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, 0x20001 -; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX7-ALIGNED-LABEL: private_store_2xi16_align4: +; GFX7-ALIGNED: ; %bb.0: +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX7-ALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-UNALIGNED-LABEL: private_store_2xi16_align4: +; GFX7-UNALIGNED: ; %bb.0: +; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX7-UNALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: private_store_2xi16_align4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLASTSCR-LABEL: private_store_2xi16_align4: +; GFX9-FLASTSCR: ; %bb.0: +; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX9-FLASTSCR-NEXT: scratch_store_dword v1, v0, off +; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 store i16 1, i16 addrspace(5)* %r, align 4 store i16 2, i16 addrspace(5)* %gep.r, align 2 Index: llvm/test/CodeGen/AMDGPU/flat-scratch.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -0,0 +1,1241 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s + +define amdgpu_kernel void @zero_init_kernel() { +; GFX9-LABEL: zero_init_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:76 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:72 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:68 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:64 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:60 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:56 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:52 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: 
scratch_store_dword off, v0, vcc_hi offset:48 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:44 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:40 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:36 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:32 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:28 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:24 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:20 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:16 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: zero_init_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword off, v0, null offset:76 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:72 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:68 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:64 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:60 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:56 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:52 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:48 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:44 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:40 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:36 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:32 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:28 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:24 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:20 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:16 +; GFX10-NEXT: s_endpgm + %alloca = alloca [32 x i16], align 2, addrspace(5) + %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + ret void +} + +define void @zero_init_foo() { +; GFX9-LABEL: zero_init_foo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:60 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:56 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:52 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:48 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:44 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:40 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:36 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:32 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:28 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:24 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:20 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:16 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:12 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:8 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 +; GFX9-NEXT: scratch_store_dword off, v0, s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: zero_init_foo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: 
scratch_store_dword off, v0, s32 offset:60 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:56 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:52 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:48 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:44 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:40 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:36 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:32 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:28 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:24 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:20 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:16 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:12 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:8 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 +; GFX10-NEXT: scratch_store_dword off, v0, s32 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %alloca = alloca [32 x i16], align 2, addrspace(5) + %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + ret void +} + +define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { +; GFX9-LABEL: store_load_sindex_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-NEXT: s_and_b32 s0, s0, 15 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_add_u32 s1, 4, s1 +; GFX9-NEXT: scratch_store_dword off, v0, s1 +; GFX9-NEXT: s_add_u32 s0, 4, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_sindex_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_and_b32 s1, s0, 15 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 2 +; GFX10-NEXT: s_add_u32 s0, 4, s0 +; GFX10-NEXT: s_add_u32 s1, 4, s1 +; GFX10-NEXT: scratch_store_dword off, v0, s0 +; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: s_endpgm +bb: + %i = alloca [32 x float], align 4, addrspace(5) + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { +; GFX9-LABEL: store_load_sindex_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-NEXT: s_and_b32 s0, s0, 15 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_add_u32 s1, 4, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: scratch_store_dword off, v0, s1 +; GFX9-NEXT: s_add_u32 s0, 4, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_sindex_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_and_b32 s1, s0, 15 +; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 2 +; GFX10-NEXT: s_add_u32 s0, 4, s0 +; GFX10-NEXT: s_add_u32 s1, 4, 
s1 +; GFX10-NEXT: scratch_store_dword off, v0, s0 +; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: s_endpgm +bb: + %i = alloca [32 x float], align 4, addrspace(5) + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_kernel void @store_load_vindex_kernel() { +; GFX9-LABEL: store_load_vindex_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_vindex_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: v_mov_b32_e32 v1, 4 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX10-NEXT: scratch_store_dword v2, v3, off +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX10-NEXT: s_endpgm +bb: + %i = alloca [32 x float], align 4, addrspace(5) + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %i3 = zext i32 %i2 to i64 + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = sub nsw i32 31, %i2 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define void @store_load_vindex_foo(i32 %idx) { +; GFX9-LABEL: store_load_vindex_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s32 +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 +; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_vindex_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: v_mov_b32_e32 v2, s32 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 +; GFX10-NEXT: scratch_store_dword v0, v1, off +; GFX10-NEXT: scratch_load_dword v0, v2, off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %i = alloca [32 x float], align 4, addrspace(5) + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = 
bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define void @private_ptr_foo(float addrspace(5)* nocapture %arg) { +; GFX9-LABEL: private_ptr_foo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: private_ptr_foo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1 + store float 1.000000e+01, float addrspace(5)* %gep, align 4 + ret void +} + +define amdgpu_kernel void @zero_init_small_offset_kernel() { +; GFX9-LABEL: zero_init_small_offset_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:284 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:280 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:276 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:272 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:300 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:296 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:292 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:288 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:316 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:312 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:308 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:304 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:332 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:328 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:324 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:320 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: zero_init_small_offset_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: scratch_load_dword v0, off, null offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword off, v0, null offset:284 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:280 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:276 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:272 +; GFX10-NEXT: 
scratch_store_dword off, v0, null offset:300 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:296 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:292 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:288 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:316 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:312 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:308 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:304 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:332 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:328 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:324 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:320 +; GFX10-NEXT: s_endpgm + %padding = alloca [64 x i32], align 4, addrspace(5) + %alloca = alloca [32 x i16], align 2, addrspace(5) + %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + ret void +} + +define void @zero_init_small_offset_foo() { +; GFX9-LABEL: zero_init_small_offset_foo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: scratch_load_dword v0, off, s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:268 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:264 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:260 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:256 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:284 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:280 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:276 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:272 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:300 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:296 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:292 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:288 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:316 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:312 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:308 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:304 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: zero_init_small_offset_foo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s32 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:268 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:264 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:260 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:256 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:284 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:280 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:276 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:272 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:300 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:296 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:292 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:288 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:316 
+; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:312 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:308 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:304 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %padding = alloca [64 x i32], align 4, addrspace(5) + %alloca = alloca [32 x i16], align 2, addrspace(5) + %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + ret void +} + +define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { +; GFX9-LABEL: store_load_sindex_small_offset_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-NEXT: s_and_b32 s0, s0, 15 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_add_u32 s1, 0x104, s1 +; GFX9-NEXT: scratch_store_dword off, v0, s1 +; GFX9-NEXT: s_add_u32 s0, 0x104, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_sindex_small_offset_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: scratch_load_dword v0, off, null offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_and_b32 s1, s0, 15 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 2 +; GFX10-NEXT: s_add_u32 s0, 0x104, s0 +; GFX10-NEXT: s_add_u32 s1, 0x104, s1 +; GFX10-NEXT: scratch_store_dword off, v0, s0 +; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: s_endpgm +bb: + %padding = alloca [64 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { +; GFX9-LABEL: store_load_sindex_small_offset_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-NEXT: s_and_b32 s0, s0, 15 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_add_u32 s1, 0x104, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: scratch_store_dword off, v0, s1 +; GFX9-NEXT: s_add_u32 s0, 0x104, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_sindex_small_offset_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: scratch_load_dword v0, off, null 
offset:4 +; GFX10-NEXT: s_and_b32 s1, s0, 15 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 2 +; GFX10-NEXT: s_add_u32 s0, 0x104, s0 +; GFX10-NEXT: s_add_u32 s1, 0x104, s1 +; GFX10-NEXT: scratch_store_dword off, v0, s0 +; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: s_endpgm +bb: + %padding = alloca [64 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { +; GFX9-LABEL: store_load_vindex_small_offset_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_vindex_small_offset_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX10-NEXT: scratch_load_dword v1, off, null offset:4 +; GFX10-NEXT: scratch_store_dword v2, v3, off +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX10-NEXT: s_endpgm +bb: + %padding = alloca [64 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %i3 = zext i32 %i2 to i64 + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = sub nsw i32 31, %i2 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define void @store_load_vindex_small_offset_foo(i32 %idx) { +; GFX9-LABEL: store_load_vindex_small_offset_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: scratch_load_dword v1, off, s32 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x100 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: 
v_lshl_add_u32 v2, v0, 2, v1 +; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_vindex_small_offset_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x100 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 +; GFX10-NEXT: scratch_load_dword v3, off, s32 +; GFX10-NEXT: scratch_store_dword v0, v1, off +; GFX10-NEXT: scratch_load_dword v0, v2, off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %padding = alloca [64 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_kernel void @zero_init_large_offset_kernel() { +; GFX9-LABEL: zero_init_large_offset_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:12 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:8 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:28 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:24 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:20 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:16 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:44 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:40 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:36 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:32 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:60 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:56 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, 
vcc_hi offset:52 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:48 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: zero_init_large_offset_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: scratch_load_dword v0, off, null offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:12 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:8 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:28 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:24 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:20 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:16 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:44 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:40 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:36 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:32 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:60 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:56 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:52 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:48 +; GFX10-NEXT: s_endpgm + %padding = alloca [4096 x i32], align 4, addrspace(5) + %alloca = alloca [32 x i16], align 2, addrspace(5) + %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + ret void +} + +define void @zero_init_large_offset_foo() { +; GFX9-LABEL: zero_init_large_offset_foo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: scratch_load_dword v0, off, s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:12 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi 
offset:8 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:28 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:24 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:20 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:16 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:44 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:40 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:36 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:32 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:60 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:56 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:52 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:48 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: zero_init_large_offset_foo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s32 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:12 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:8 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:28 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:24 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:20 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:16 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:44 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:40 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:36 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:32 +; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:60 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:56 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:52 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:48 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %padding = alloca [4096 x i32], align 4, addrspace(5) + %alloca = alloca [32 x i16], align 2, addrspace(5) + %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + ret void +} + +define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { +; GFX9-LABEL: store_load_sindex_large_offset_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-NEXT: s_and_b32 s0, s0, 15 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_add_u32 s1, 0x4004, s1 +; GFX9-NEXT: scratch_store_dword off, v0, s1 +; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_sindex_large_offset_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: scratch_load_dword v0, off, null offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_and_b32 s1, s0, 15 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 2 +; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX10-NEXT: s_add_u32 s1, 0x4004, s1 +; GFX10-NEXT: scratch_store_dword off, v0, s0 +; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: s_endpgm +bb: + %padding = alloca [4096 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { +; GFX9-LABEL: store_load_sindex_large_offset_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-NEXT: s_and_b32 s0, s0, 15 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: 
s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_add_u32 s1, 0x4004, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: scratch_store_dword off, v0, s1 +; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_sindex_large_offset_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: scratch_load_dword v0, off, null offset:4 +; GFX10-NEXT: s_and_b32 s1, s0, 15 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 2 +; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX10-NEXT: s_add_u32 s1, 0x4004, s1 +; GFX10-NEXT: scratch_store_dword off, v0, s0 +; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: s_endpgm +bb: + %padding = alloca [4096 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { +; GFX9-LABEL: store_load_vindex_large_offset_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_vindex_large_offset_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX10-NEXT: scratch_load_dword v1, off, null offset:4 +; GFX10-NEXT: scratch_store_dword v2, v3, off +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX10-NEXT: s_endpgm +bb: + %padding = alloca [4096 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %i3 = zext i32 %i2 to i64 + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = sub nsw i32 31, %i2 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + 
ret void +} + +define void @store_load_vindex_large_offset_foo(i32 %idx) { +; GFX9-LABEL: store_load_vindex_large_offset_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: scratch_load_dword v1, off, s32 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 +; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_vindex_large_offset_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 +; GFX10-NEXT: scratch_load_dword v3, off, s32 +; GFX10-NEXT: scratch_store_dword v0, v1, off +; GFX10-NEXT: scratch_load_dword v0, v2, off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %padding = alloca [4096 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_kernel void @store_load_large_imm_offset_kernel() { +; GFX9-LABEL: store_load_large_imm_offset_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_movk_i32 s0, 0x3000 +; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 +; GFX9-NEXT: s_add_u32 s0, 4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 +; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_large_imm_offset_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: v_mov_b32_e32 v0, 13 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: s_movk_i32 s0, 0x3800 +; GFX10-NEXT: s_add_u32 s0, 4, s0 +; GFX10-NEXT: scratch_store_dword off, v0, null offset:4 +; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 +; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 +; GFX10-NEXT: s_endpgm +bb: + %i = alloca [4096 x i32], align 4, addrspace(5) + %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef + store volatile i32 13, i32 addrspace(5)* %i1, align 4 + %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 + store volatile i32 15, i32 addrspace(5)* %i7, align 4 + %i10 = getelementptr inbounds [4096 x i32], [4096 
x i32] addrspace(5)* %i, i32 0, i32 4000 + %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 + ret void +} + +define void @store_load_large_imm_offset_foo() { +; GFX9-LABEL: store_load_large_imm_offset_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s4, 0x3000 +; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: scratch_store_dword off, v0, s32 +; GFX9-NEXT: s_add_u32 s4, s32, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: scratch_store_dword off, v0, s4 offset:3712 +; GFX9-NEXT: scratch_load_dword v0, off, s4 offset:3712 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_large_imm_offset_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 13 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: s_movk_i32 s4, 0x3800 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_add_u32 s4, s32, s4 +; GFX10-NEXT: scratch_store_dword off, v0, s32 +; GFX10-NEXT: scratch_store_dword off, v1, s4 offset:1664 +; GFX10-NEXT: scratch_load_dword v0, off, s4 offset:1664 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %i = alloca [4096 x i32], align 4, addrspace(5) + %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef + store volatile i32 13, i32 addrspace(5)* %i1, align 4 + %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 + store volatile i32 15, i32 addrspace(5)* %i7, align 4 + %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 + %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 + ret void +} + +define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { +; GFX9-LABEL: store_load_vidx_sidx_offset: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_vidx_sidx_offset: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 4 +; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024 +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 +; GFX10-NEXT: s_endpgm +bb: + %alloca = alloca [32 x i32], align 4, addrspace(5) + %vidx = tail call i32 @llvm.amdgcn.workitem.id.x() + %add1 = add nsw i32 %sidx, %vidx + %add2 = add nsw i32 %add1, 256 + %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2 + store volatile i32 15, i32 addrspace(5)* %gep, align 4 + %load = load volatile i32, i32 addrspace(5)* %gep, align 4 + ret void +} + +; FIXME: Multi-DWORD scratch should be supported +define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) { +; GFX9-LABEL: store_load_i64_aligned: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_dword v0, v1, 
off +; GFX9-NEXT: scratch_load_dword v1, v0, off offset:4 +; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_i64_aligned: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4 +; GFX10-NEXT: scratch_store_dword v0, v2, off +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: scratch_load_dword v1, v0, off offset:4 +; GFX10-NEXT: scratch_load_dword v0, v0, off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + store volatile i64 15, i64 addrspace(5)* %arg, align 8 + %load = load volatile i64, i64 addrspace(5)* %arg, align 8 + ret void +} + +; FIXME: Multi-DWORD unaligned scratch should be supported +define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) { +; GFX9-LABEL: store_load_i64_unaligned: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:7 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:6 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:5 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:4 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:3 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:2 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:1 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_byte v0, v1, off +; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v1, v0, off +; GFX9-NEXT: scratch_load_ubyte v0, v0, off offset:1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_i64_unaligned: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:7 +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:6 +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:5 +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:4 +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:3 +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:2 +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:1 +; GFX10-NEXT: scratch_store_byte v0, v2, off +; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:7 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_load_ubyte v1, v0, 
off offset:3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: scratch_load_ubyte v1, v0, off +; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + store volatile i64 15, i64 addrspace(5)* %arg, align 1 + %load = load volatile i64, i64 addrspace(5)* %arg, align 1 + ret void +} + +declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg) +declare i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -1,5 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-MUBUF,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-sroa=0 -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLATSCR,FLATSCR %s ; Test that non-entry function frame indices are expanded properly to ; give an index relative to the scratch wave offset register @@ -9,9 +10,13 @@ ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshr_b32_e64 v0, s32, 6 -; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32 + +; GFX9-FLATSCR: v_mov_b32_e32 v0, s32 +; GFX9-FLATSCR-NOT: v_lshrrev_b32_e64 + +; MUBUF-NOT: v_mov -; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 define void @func_mov_fi_i32() #0 { %alloca = alloca i32, addrspace(5) @@ -30,11 +35,14 @@ ; CI-NEXT: v_add_i32_e{{32|64}} v0, {{s\[[0-9]+:[0-9]+\]|vcc}}, 4, [[SCALED]] ; CI-NEXT: ds_write_b32 v0, v0 -; GFX9: v_lshrrev_b32_e64 v0, 6, s32 -; GFX9-NEXT: ds_write_b32 v0, v0 -; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 -; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] -; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-FLATSCR: v_mov_b32_e32 v0, s32 +; GFX9-FLATSCR: s_add_u32 [[ADD:[^,]+]], s32, 4 +; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, [[ADD]] +; GFX9-NEXT: ds_write_b32 v0, v0 define void @func_mov_fi_i32_offset() #0 { %alloca0 = alloca i32, addrspace(5) %alloca1 = alloca i32, addrspace(5) @@ -52,8 +60,11 @@ ; CI: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]] -; GFX9: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 -; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] +; GFX9-MUBUF: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] + +; GFX9-FLATSCR: v_mov_b32_e32 [[ADD:v[0-9]+]], s32 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v0, 4, [[ADD]] ; 
GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -71,7 +82,8 @@ ; CI: v_lshr_b32_e64 v0, s32, 6 -; GFX9: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-MUBUF: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-FLATSCR: v_mov_b32_e32 v0, s32 ; GCN-NEXT: v_mul_u32_u24_e32 v0, 9, v0 ; GCN-NOT: v_mov @@ -86,7 +98,8 @@ ; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr: ; GCN: v_mov_b32_e32 v1, 15{{$}} -; GCN: buffer_store_dword v1, v0, s[0:3], 0 offen{{$}} +; MUBUF: buffer_store_dword v1, v0, s[0:3], 0 offen{{$}} +; GFX9-FLATSCR: scratch_store_dword v0, v1, off{{$}} define void @func_store_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 { store volatile i32 15, i32 addrspace(5)* %ptr ret void @@ -94,7 +107,8 @@ ; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr: ; GCN: s_waitcnt -; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen{{$}} +; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen{{$}} +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off{{$}} define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 { %val = load volatile i32, i32 addrspace(5)* %ptr ret void @@ -106,8 +120,11 @@ ; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6 ; CI-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]] -; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 -; GFX9-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]] +; GFX9-MUBUF: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 +; GFX9-MUBUF-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]] + +; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32 +; GFX9-FLATSCR-NEXT: v_or_b32_e32 v0, 4, [[SP]] ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -121,8 +138,10 @@ ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_value: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_ubyte v0, off, s[0:3], s32 -; GCN_NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; MUBUF-NEXT: buffer_load_ubyte v0, off, s[0:3], s32 +; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; GFX9-FLATSCR-NEXT: scratch_load_ubyte v0, off, s32 +; GFX9-FLATSCR-NEXT: scratch_load_dword v1, off, s32 offset:4 define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* byval %arg0) #0 { %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 0 %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 1 @@ -137,15 +156,17 @@ ; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6 -; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 +; GFX9-MUBUF: v_lshrrev_b32_e64 [[SP:v[0-9]+]], 6, s32 +; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32 ; GCN: s_and_saveexec_b64 ; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]] ; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}} -; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SHIFT]] -; GFX9: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}} +; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]] +; GFX9-MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}} +; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, [[SP]], off offset:4{{$}} ; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]] define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 } addrspace(5)* byval %arg0, i32 %arg2) #0 { @@ -170,8 +191,11 @@ ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI: v_add_i32_e32 [[VZ:v[0-9]+]], vcc, [[K]], [[SCALED]] -; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 -; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] +; GFX9-MUBUF-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 +; GFX9-MUBUF: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] + +; GFX9-FLATSCR-DAG: s_add_u32 
[[SZ:[^,]+]], s32, 0x200 +; GFX9-FLATSCR: v_mov_b32_e32 [[VZ:v[0-9]+]], [[SZ]] ; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]] ; GCN: ds_write_b32 v0, [[VZ]] @@ -193,8 +217,11 @@ ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]] -; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 -; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] +; GFX9-MUBUF-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 +; GFX9-MUBUF: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] + +; GFX9-FLATSCR-DAG: s_add_u32 [[SZ:[^,]+]], s32, 0x200 +; GFX9-FLATSCR: v_mov_b32_e32 [[VZ:v[0-9]+]], [[SZ]] ; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]] ; GCN: ds_write_b32 v0, [[VZ]] @@ -219,10 +246,14 @@ ; GCN-LABEL: {{^}}undefined_stack_store_reg: ; GCN: s_and_saveexec_b64 -; GCN: buffer_store_dword v0, off, s[0:3], s33 offset: -; GCN: buffer_store_dword v0, off, s[0:3], s33 offset: -; GCN: buffer_store_dword v0, off, s[0:3], s33 offset: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: +; MUBUF: buffer_store_dword v0, off, s[0:3], s33 offset: +; MUBUF: buffer_store_dword v0, off, s[0:3], s33 offset: +; MUBUF: buffer_store_dword v0, off, s[0:3], s33 offset: +; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: +; FLATSCR: scratch_store_dword off, v0, s33 offset: +; FLATSCR: scratch_store_dword off, v0, s33 offset: +; FLATSCR: scratch_store_dword off, v0, s33 offset: +; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset: define void @undefined_stack_store_reg(float %arg, i32 %arg1) #0 { bb: %tmp = alloca <4 x float>, align 16, addrspace(5) @@ -243,13 +274,17 @@ ; GCN-LABEL: {{^}}alloca_ptr_nonentry_block: ; GCN: s_and_saveexec_b64 -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 +; MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 +; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4 ; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6 ; CI-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]] -; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 -; GFX9-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]] +; GFX9-MUBUF: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 +; GFX9-MUBUF-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]] + +; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32 +; GFX9-FLATSCR-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SP]] ; GCN: ds_write_b32 v{{[0-9]+}}, [[PTR]] define void @alloca_ptr_nonentry_block(i32 %arg0) #0 { Index: llvm/test/CodeGen/AMDGPU/load-hi16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -1,6 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-MUBUF %s ; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-FLATSCR %s ; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lo: ; 
GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -493,7 +494,8 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-MUBUF: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, s32 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -512,7 +514,8 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-MUBUF: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, s32 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -531,7 +534,9 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}} +; GFX900-FLATSCR: s_movk_i32 [[SOFF:[^,]+]], 0xffe +; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, [[SOFF]]{{$}} ; GFX900: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -549,7 +554,9 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, [[SOFF]]{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt @@ -567,7 +574,8 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-MUBUF: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-FLATSCR: scratch_load_ubyte_d16_hi v0, off, s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -587,7 +595,8 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-MUBUF: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-FLATSCR: scratch_load_ubyte_d16_hi v0, off, s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -608,7 +617,8 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_sexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-MUBUF: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-FLATSCR: scratch_load_sbyte_d16_hi v0, off, s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -629,7 +639,8 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-MUBUF: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-FLATSCR: scratch_load_sbyte_d16_hi v0, off, s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT:
global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -649,7 +660,9 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]]{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt @@ -668,7 +681,9 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v1, off, [[SOFF]]{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt @@ -687,7 +702,9 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]]{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt @@ -787,8 +804,10 @@ ; to offset variant. ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset: -; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094 +; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v{{[0-9]+}}, off, s32 offset:4094 define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -804,8 +823,10 @@ } ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset: -; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 +; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095 define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -822,8 +843,10 @@ } ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset: -; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 +; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095 define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -974,9 +997,11 @@ ; FIXME: Is there a cost to using the extload over not? 
; GCN-LABEL: {{^}}load_private_v2i16_split: ; GCN: s_waitcnt -; GFX900: buffer_load_ushort v0, off, s[0:3], s32{{$}} +; GFX900-MUBUF: buffer_load_ushort v0, off, s[0:3], s32{{$}} +; GFX900-FLATSCR: scratch_load_ushort v0, off, s32{{$}} ; GFX900-NEXT: s_waitcnt -; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2 +; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2 +; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s32 offset:2 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 { Index: llvm/test/CodeGen/AMDGPU/load-lo16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-MUBUF %s ; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs --amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-FLATSCR %s define <2 x i16> @load_local_lo_v2i16_undeflo(i16 addrspace(3)* %in) #0 { ; GFX900-LABEL: load_local_lo_v2i16_undeflo: @@ -1177,14 +1178,14 @@ } define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg: ; GFX906: ; %bb.0: ; %entry @@ -1207,6 +1208,15 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v0, off, s32 offset:4094 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047 @@ -1217,16 +1227,16 @@ } define void 
@load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reghi_vreg: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reghi_vreg: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX900-MUBUF-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg: ; GFX906: ; %bb.0: ; %entry @@ -1249,6 +1259,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reghi_vreg: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_ushort v1, off, s32 offset:4094 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX900-FLATSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047 %load = load i16, i16 addrspace(5)* %gep @@ -1259,14 +1280,14 @@ } define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg: ; GFX906: ; %bb.0: ; %entry @@ -1290,6 +1311,15 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v0, off, s32 offset:4094 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> %gep = getelementptr inbounds half, half 
addrspace(5)* %in, i64 2047 @@ -1300,14 +1330,14 @@ } define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: ; GFX906: ; %bb.0: ; %entry @@ -1330,6 +1360,16 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*) @@ -1339,14 +1379,14 @@ } define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: ; GFX906: ; %bb.0: ; %entry @@ -1369,6 +1409,16 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*) @@ -1378,14 +1428,14 @@ } define void 
@load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: ; GFX906: ; %bb.0: ; %entry @@ -1409,6 +1459,16 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*) @@ -1418,14 +1478,14 @@ } define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8: ; GFX906: ; %bb.0: ; %entry @@ -1449,6 +1509,15 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095 @@ -1460,14 +1529,14 @@ } define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 { -; GFX900-LABEL: 
load_private_lo_v2i16_reglo_vreg_sexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8: ; GFX906: ; %bb.0: ; %entry @@ -1490,6 +1559,15 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095 @@ -1501,14 +1579,14 @@ } define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: ; GFX906: ; %bb.0: ; %entry @@ -1532,6 +1610,16 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) @@ -1542,14 +1630,14 @@ } define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: ; GFX906: ; %bb.0: ; %entry @@ -1572,6 +1660,16 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) @@ -1582,14 +1680,14 @@ } define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: ; GFX906: ; %bb.0: ; %entry @@ -1614,6 +1712,16 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) @@ -1801,16 +1909,16 @@ } define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: 
v_mov_b32_e32 v1, 0x7b -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset: ; GFX906: ; %bb.0: ; %entry @@ -1837,6 +1945,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v0, off, s32 offset:4094 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i16], align 2, addrspace(5) @@ -1851,16 +1970,16 @@ } define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: ; GFX906: ; %bb.0: ; %entry @@ -1887,6 +2006,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; 
GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i8], align 2, addrspace(5) @@ -1902,16 +2032,16 @@ } define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: ; GFX906: ; %bb.0: ; %entry @@ -1939,6 +2069,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i8], align 2, addrspace(5) @@ -1954,16 +2095,16 @@ } define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset: ; GFX906: ; %bb.0: ; %entry @@ -1991,6 +2132,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i8], align 2, addrspace(5) @@ -2007,16 +2159,16 @@ } define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset: ; GFX906: ; %bb.0: ; %entry @@ -2045,6 +2197,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i8], align 2, addrspace(5) Index: llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck -check-prefixes=GCN,GFX9,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,GFX9,FLATSCR %s ; Make sure the correct frame offset is used with the local ; frame area. @@ -16,42 +17,78 @@ ; correct FP offset.
define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspace(1)* %in) { -; GCN-LABEL: local_stack_offset_uses_sp: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: v_mov_b32_e32 v1, 0x3000 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: v_add_u32_e32 v0, 64, v1 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v3, 0x2000 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: BB0_1: ; %loadstoreloop -; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_add_u32_e32 v3, s6, v1 -; GCN-NEXT: s_add_i32 s6, s6, 1 -; GCN-NEXT: s_cmpk_lt_u32 s6, 0x2120 -; GCN-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen -; GCN-NEXT: s_cbranch_scc1 BB0_1 -; GCN-NEXT: ; %bb.2: ; %split -; GCN-NEXT: v_mov_b32_e32 v1, 0x3000 -; GCN-NEXT: v_add_u32_e32 v1, 0x20d0, v1 -; GCN-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: global_store_dwordx2 v[2:3], v[0:1], off -; GCN-NEXT: s_endpgm +; MUBUF-LABEL: local_stack_offset_uses_sp: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; MUBUF-NEXT: s_add_u32 s0, s0, s9 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000 +; MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; MUBUF-NEXT: v_add_u32_e32 v0, 64, v1 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0 +; MUBUF-NEXT: v_mov_b32_e32 v3, 0x2000 +; MUBUF-NEXT: s_mov_b32 s6, 0 +; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; MUBUF-NEXT: BB0_1: ; %loadstoreloop +; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 +; MUBUF-NEXT: v_add_u32_e32 v3, s6, v1 +; MUBUF-NEXT: s_add_i32 s6, s6, 1 +; MUBUF-NEXT: s_cmpk_lt_u32 s6, 0x2120 +; MUBUF-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen +; MUBUF-NEXT: s_cbranch_scc1 BB0_1 +; MUBUF-NEXT: ; %bb.2: ; %split +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000 +; MUBUF-NEXT: v_add_u32_e32 v1, 0x20d0, v1 +; MUBUF-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(1) +; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: v_mov_b32_e32 v2, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; MUBUF-NEXT: v_mov_b32_e32 v3, s5 +; MUBUF-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; MUBUF-NEXT: s_endpgm +; +; FLATSCR-LABEL: local_stack_offset_uses_sp: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: s_movk_i32 vcc_hi, 0x2000 +; FLATSCR-NEXT: s_mov_b32 s6, 0 +; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi +; 
FLATSCR-NEXT: BB0_1: ; %loadstoreloop +; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 +; FLATSCR-NEXT: s_add_u32 s7, 0x3000, s6 +; FLATSCR-NEXT: s_add_i32 s6, s6, 1 +; FLATSCR-NEXT: s_cmpk_lt_u32 s6, 0x2120 +; FLATSCR-NEXT: scratch_store_byte off, v0, s7 +; FLATSCR-NEXT: s_cbranch_scc1 BB0_1 +; FLATSCR-NEXT: ; %bb.2: ; %split +; FLATSCR-NEXT: s_movk_i32 s6, 0x20d0 +; FLATSCR-NEXT: s_add_u32 s6, 0x3000, s6 +; FLATSCR-NEXT: scratch_load_dword v1, off, s6 offset:4 +; FLATSCR-NEXT: s_movk_i32 s6, 0x2000 +; FLATSCR-NEXT: s_add_u32 s6, 0x3000, s6 +; FLATSCR-NEXT: scratch_load_dword v0, off, s6 offset:208 +; FLATSCR-NEXT: s_movk_i32 s6, 0x3000 +; FLATSCR-NEXT: scratch_load_dword v2, off, s6 offset:68 +; FLATSCR-NEXT: s_movk_i32 s6, 0x3000 +; FLATSCR-NEXT: scratch_load_dword v3, off, s6 offset:64 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v2, s4 +; FLATSCR-NEXT: v_mov_b32_e32 v3, s5 +; FLATSCR-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; FLATSCR-NEXT: s_endpgm entry: %pin.low = alloca i32, align 8192, addrspace(5) %local.area = alloca [1060 x i64], align 4096, addrspace(5) @@ -68,43 +105,83 @@ } define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspace(1)* %in) { -; GCN-LABEL: func_local_stack_offset_uses_sp: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_add_u32 s4, s32, 0x7ffc0 -; GCN-NEXT: s_mov_b32 s5, s33 -; GCN-NEXT: s_and_b32 s33, s4, 0xfff80000 -; GCN-NEXT: v_lshrrev_b32_e64 v3, 6, s33 -; GCN-NEXT: v_add_u32_e32 v3, 0x1000, v3 -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_add_u32_e32 v2, 64, v3 -; GCN-NEXT: s_mov_b32 s4, 0 -; GCN-NEXT: s_add_u32 s32, s32, 0x180000 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 -; GCN-NEXT: BB1_1: ; %loadstoreloop -; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_add_u32_e32 v5, s4, v3 -; GCN-NEXT: s_add_i32 s4, s4, 1 -; GCN-NEXT: s_cmpk_lt_u32 s4, 0x2120 -; GCN-NEXT: buffer_store_byte v4, v5, s[0:3], 0 offen -; GCN-NEXT: s_cbranch_scc1 BB1_1 -; GCN-NEXT: ; %bb.2: ; %split -; GCN-NEXT: v_lshrrev_b32_e64 v3, 6, s33 -; GCN-NEXT: v_add_u32_e32 v3, 0x1000, v3 -; GCN-NEXT: v_add_u32_e32 v3, 0x20d0, v3 -; GCN-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:4 -; GCN-NEXT: s_sub_u32 s32, s32, 0x180000 -; GCN-NEXT: s_mov_b32 s33, s5 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v4, v5 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc -; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; MUBUF-LABEL: func_local_stack_offset_uses_sp: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_add_u32 s4, s32, 0x7ffc0 +; MUBUF-NEXT: s_mov_b32 s5, s33 +; MUBUF-NEXT: s_and_b32 s33, s4, 0xfff80000 +; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33 +; MUBUF-NEXT: v_add_u32_e32 v3, 0x1000, v3 +; MUBUF-NEXT: v_mov_b32_e32 v4, 0 +; MUBUF-NEXT: v_add_u32_e32 v2, 64, v3 +; MUBUF-NEXT: s_mov_b32 s4, 0 +; MUBUF-NEXT: s_add_u32 s32, s32, 0x180000 +; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s33 +; MUBUF-NEXT: BB1_1: ; %loadstoreloop +; MUBUF-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; MUBUF-NEXT: v_add_u32_e32 v5, s4, v3 +; MUBUF-NEXT: s_add_i32 s4, s4, 1 +; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120 +; MUBUF-NEXT: buffer_store_byte v4, v5, s[0:3], 0 offen +; MUBUF-NEXT: s_cbranch_scc1 BB1_1 +; MUBUF-NEXT: ; %bb.2: ; %split +; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33 +; MUBUF-NEXT: v_add_u32_e32 v3, 0x1000, v3 +; MUBUF-NEXT: v_add_u32_e32 v3, 0x20d0, v3 +; MUBUF-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen +; MUBUF-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen +; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x180000 +; MUBUF-NEXT: s_mov_b32 s33, s5 +; MUBUF-NEXT: s_waitcnt vmcnt(1) +; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v4, v5 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; MUBUF-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: func_local_stack_offset_uses_sp: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_add_u32 s4, s32, 0x1fff +; FLATSCR-NEXT: s_mov_b32 s6, s33 +; FLATSCR-NEXT: s_and_b32 s33, s4, 0xffffe000 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 +; FLATSCR-NEXT: s_mov_b32 s4, 0 +; FLATSCR-NEXT: s_add_u32 s32, s32, 0x6000 +; FLATSCR-NEXT: scratch_store_dword off, v2, s33 +; FLATSCR-NEXT: BB1_1: ; %loadstoreloop +; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 +; FLATSCR-NEXT: s_add_u32 vcc_hi, s33, 0x1000 +; FLATSCR-NEXT: s_add_u32 s5, vcc_hi, s4 +; FLATSCR-NEXT: s_add_i32 s4, s4, 1 +; FLATSCR-NEXT: s_cmpk_lt_u32 s4, 0x2120 +; FLATSCR-NEXT: scratch_store_byte off, v2, s5 +; FLATSCR-NEXT: s_cbranch_scc1 BB1_1 +; FLATSCR-NEXT: ; %bb.2: ; %split +; FLATSCR-NEXT: s_movk_i32 s4, 0x20d0 +; FLATSCR-NEXT: s_add_u32 s5, s33, 0x1000 +; FLATSCR-NEXT: s_add_u32 s4, s5, s4 +; FLATSCR-NEXT: scratch_load_dword v3, off, s4 offset:4 +; FLATSCR-NEXT: s_movk_i32 s4, 0x2000 +; FLATSCR-NEXT: s_add_u32 s5, s33, 0x1000 +; FLATSCR-NEXT: s_add_u32 s4, s5, s4 +; FLATSCR-NEXT: scratch_load_dword v2, off, s4 offset:208 +; FLATSCR-NEXT: s_add_u32 s4, s33, 0x1000 +; FLATSCR-NEXT: scratch_load_dword v4, off, s4 offset:68 +; FLATSCR-NEXT: s_add_u32 s4, s33, 0x1000 +; FLATSCR-NEXT: scratch_load_dword v5, off, s4 offset:64 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x6000 +; FLATSCR-NEXT: s_mov_b32 s33, s6 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc +; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %pin.low = alloca i32, align 8192, addrspace(5) %local.area = alloca [1060 x i64], align 4096, addrspace(5) Index: llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll +++ llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll @@ -1,31 +1,56 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s -check-prefix=MUBUF +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-flat-scratch < %s | FileCheck %s -check-prefix=FLATSCR ; Make sure there's no assertion from passing a 0 alignment value define void @memcpy_fixed_align(i8 
addrspace(5)* %dst, i8 addrspace(1)* %src) { -; CHECK-LABEL: memcpy_fixed_align: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dword v0, v[1:2], off offset:36 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 -; CHECK-NEXT: global_load_dword v0, v[1:2], off offset:32 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 -; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 -; CHECK-NEXT: global_load_dwordx4 v[0:3], v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] +; MUBUF-LABEL: memcpy_fixed_align: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: global_load_dword v0, v[1:2], off offset:36 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 +; MUBUF-NEXT: global_load_dword v0, v[1:2], off offset:32 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 +; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 +; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 +; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 +; MUBUF-NEXT: global_load_dwordx4 v[0:3], v[1:2], off +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 +; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: memcpy_fixed_align: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: global_load_dword v0, v[1:2], off offset:36 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:36 +; FLATSCR-NEXT: global_load_dword v0, v[1:2], off offset:32 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:32 +; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dword off, v6, s32 offset:28 +; FLATSCR-NEXT: scratch_store_dword off, v5, s32 offset:24 +; FLATSCR-NEXT: scratch_store_dword off, v4, s32 offset:20 +; FLATSCR-NEXT: scratch_store_dword off, v3, s32 offset:16 +; FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[1:2], off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dword off, v3, s32 offset:12 +; FLATSCR-NEXT: scratch_store_dword off, v2, s32 offset:8 +; FLATSCR-NEXT: scratch_store_dword off, v1, s32 offset:4 +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 +; FLATSCR-NEXT: s_waitcnt 
vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [40 x i8], addrspace(5) %cast = bitcast [40 x i8] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memcpy.p5i8.p1i8.i64(i8 addrspace(5)* align 4 dereferenceable(40) %cast, i8 addrspace(1)* align 4 dereferenceable(40) %src, i64 40, i1 false) Index: llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE,FLATSCR %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024,FLATSCR %s ; FIXME: Generated test checks do not check metadata at the end of the ; function, so this also includes manually added checks. @@ -11,44 +13,82 @@ ; FIXME: FunctionLoweringInfo unhelpfully doesn't preserve an ; alignment less than the stack alignment. 
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { -; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_movk_i32 s32, 0x400 -; GCN-NEXT: s_mov_b32 s33, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cbranch_scc1 BB0_3 -; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_cmp_lg_u32 s9, 0 -; GCN-NEXT: s_cbranch_scc1 BB0_3 -; GCN-NEXT: ; %bb.2: ; %bb.1 -; GCN-NEXT: s_add_i32 s6, s32, 0x1000 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: s_lshl_b32 s7, s10, 2 -; GCN-NEXT: s_mov_b32 s32, s6 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: s_add_i32 s6, s6, s7 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_u32_e32 v2, v1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: BB0_3: ; %bb.2 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_endpgm +; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; MUBUF-NEXT: s_add_u32 s0, s0, s9 +; MUBUF-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; MUBUF-NEXT: s_movk_i32 s32, 0x400 +; MUBUF-NEXT: s_mov_b32 s33, 0 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: s_cmp_lg_u32 s8, 0 +; MUBUF-NEXT: s_cbranch_scc1 BB0_3 +; MUBUF-NEXT: ; %bb.1: ; %bb.0 +; MUBUF-NEXT: s_cmp_lg_u32 s9, 0 +; MUBUF-NEXT: s_cbranch_scc1 BB0_3 +; MUBUF-NEXT: ; %bb.2: ; %bb.1 +; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0 +; MUBUF-NEXT: v_mov_b32_e32 v2, s6 +; MUBUF-NEXT: s_lshl_b32 s7, s10, 2 +; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; MUBUF-NEXT: v_mov_b32_e32 v1, 1 +; MUBUF-NEXT: s_add_i32 s6, s6, s7 +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: v_mov_b32_e32 v1, s6 +; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v2, v1, v0 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: v_mov_b32_e32 v0, s4 +; MUBUF-NEXT: v_mov_b32_e32 v1, s5 +; MUBUF-NEXT: global_store_dword v[0:1], v2, off +; MUBUF-NEXT: BB0_3: ; %bb.2 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_endpgm +; +; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_mov_b32 s32, 16 +; FLATSCR-NEXT: s_mov_b32 s33, 0 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: s_cmp_lg_u32 s8, 0 +; 
FLATSCR-NEXT: s_cbranch_scc1 BB0_3 +; FLATSCR-NEXT: ; %bb.1: ; %bb.0 +; FLATSCR-NEXT: s_cmp_lg_u32 s9, 0 +; FLATSCR-NEXT: s_cbranch_scc1 BB0_3 +; FLATSCR-NEXT: ; %bb.2: ; %bb.1 +; FLATSCR-NEXT: s_mov_b32 s6, s32 +; FLATSCR-NEXT: s_movk_i32 s7, 0x1000 +; FLATSCR-NEXT: s_add_i32 s8, s6, s7 +; FLATSCR-NEXT: s_add_u32 s6, s6, s7 +; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 +; FLATSCR-NEXT: scratch_store_dword off, v1, s6 +; FLATSCR-NEXT: v_mov_b32_e32 v1, 1 +; FLATSCR-NEXT: s_lshl_b32 s6, s10, 2 +; FLATSCR-NEXT: s_mov_b32 s32, s8 +; FLATSCR-NEXT: scratch_store_dword off, v1, s8 offset:4 +; FLATSCR-NEXT: s_add_i32 s8, s8, s6 +; FLATSCR-NEXT: scratch_load_dword v1, off, s8 +; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, s4 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s5 +; FLATSCR-NEXT: global_store_dword v[0:1], v2, off +; FLATSCR-NEXT: BB0_3: ; %bb.2 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; FLATSCR-NEXT: s_endpgm entry: %cond0 = icmp eq i32 %arg.cond0, 0 @@ -83,42 +123,75 @@ ; ASSUME1024: ; ScratchSize: 1040 define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) { -; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_movk_i32 s32, 0x1000 -; GCN-NEXT: s_mov_b32 s33, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cbranch_scc1 BB1_2 -; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_add_i32 s6, s32, 0x1000 -; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: s_lshl_b32 s7, s7, 2 -; GCN-NEXT: s_mov_b32 s32, s6 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: s_add_i32 s6, s6, s7 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_u32_e32 v2, v1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: BB1_2: ; %bb.1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_endpgm +; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; MUBUF-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; MUBUF-NEXT: s_add_u32 s0, s0, s9 +; MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; MUBUF-NEXT: s_movk_i32 s32, 0x1000 +; MUBUF-NEXT: s_mov_b32 s33, 0 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: s_cmp_lg_u32 s6, 0 +; MUBUF-NEXT: s_cbranch_scc1 BB1_2 +; MUBUF-NEXT: ; %bb.1: ; %bb.0 +; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 +; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0 +; MUBUF-NEXT: v_mov_b32_e32 v2, s6 +; MUBUF-NEXT: s_lshl_b32 s7, s7, 2 +; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen +; MUBUF-NEXT: v_mov_b32_e32 v1, 1 +; MUBUF-NEXT: s_add_i32 s6, s6, s7 +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: v_mov_b32_e32 v1, s6 +; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v2, v1, v0 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: v_mov_b32_e32 v0, s4 +; MUBUF-NEXT: v_mov_b32_e32 v1, s5 +; MUBUF-NEXT: global_store_dword v[0:1], v2, off +; MUBUF-NEXT: BB1_2: ; %bb.1 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_endpgm +; +; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; FLATSCR-NEXT: s_mov_b32 s32, 64 +; FLATSCR-NEXT: s_mov_b32 s33, 0 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: s_cmp_lg_u32 s6, 0 +; FLATSCR-NEXT: s_cbranch_scc1 BB1_2 +; FLATSCR-NEXT: ; %bb.1: ; %bb.0 +; FLATSCR-NEXT: s_add_i32 s6, s32, 0x1000 +; FLATSCR-NEXT: s_and_b32 s6, s6, 0xfffff000 +; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 +; FLATSCR-NEXT: scratch_store_dword off, v1, s6 +; FLATSCR-NEXT: v_mov_b32_e32 v1, 1 +; FLATSCR-NEXT: s_lshl_b32 s7, s7, 2 +; FLATSCR-NEXT: s_mov_b32 s32, s6 +; FLATSCR-NEXT: scratch_store_dword off, v1, s6 offset:4 +; FLATSCR-NEXT: s_add_i32 s6, s6, s7 +; FLATSCR-NEXT: scratch_load_dword v1, off, s6 +; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, s4 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s5 +; FLATSCR-NEXT: global_store_dword v[0:1], v2, off +; FLATSCR-NEXT: BB1_2: ; %bb.1 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; FLATSCR-NEXT: s_endpgm entry: %cond = icmp eq i32 %arg.cond, 0 br i1 %cond, label %bb.0, label %bb.1 @@ -149,41 +222,79 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { -; GCN-LABEL: func_non_entry_block_static_alloca_align4: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s7, s33 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz BB2_3 -; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN-NEXT: s_and_b64 exec, exec, vcc -; GCN-NEXT: s_cbranch_execz BB2_3 -; GCN-NEXT: ; %bb.2: ; %bb.1 -; GCN-NEXT: s_add_i32 s6, s32, 0x1000 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v2, 1 -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_lshl_add_u32 v2, v4, 2, s6 -; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v5 -; GCN-NEXT: s_mov_b32 s32, s6 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_u32_e32 v2, v2, v3 -; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: BB2_3: ; %bb.2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: s_mov_b32 s33, s7 -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; MUBUF-LABEL: func_non_entry_block_static_alloca_align4: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s7, s33 +; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_add_u32 s32, s32, 0x400 +; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc +; MUBUF-NEXT: s_cbranch_execz BB2_3 +; MUBUF-NEXT: ; %bb.1: ; %bb.0 +; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; MUBUF-NEXT: s_and_b64 exec, exec, vcc +; MUBUF-NEXT: s_cbranch_execz BB2_3 +; MUBUF-NEXT: ; %bb.2: ; %bb.1 +; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0 +; MUBUF-NEXT: v_mov_b32_e32 v3, s6 +; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; MUBUF-NEXT: v_mov_b32_e32 v2, 1 +; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6 +; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v5 +; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 +; MUBUF-NEXT: global_store_dword v[0:1], v2, off +; MUBUF-NEXT: BB2_3: ; %bb.2 +; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5] +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x400 +; MUBUF-NEXT: s_mov_b32 s33, s7 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s9, s33 +; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_add_u32 s32, s32, 16 +; FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; FLATSCR-NEXT: s_cbranch_execz BB2_3 +; FLATSCR-NEXT: ; %bb.1: ; %bb.0 +; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; FLATSCR-NEXT: s_and_b64 exec, exec, vcc +; FLATSCR-NEXT: s_cbranch_execz BB2_3 +; FLATSCR-NEXT: ; %bb.2: ; %bb.1 +; FLATSCR-NEXT: s_mov_b32 s6, s32 +; FLATSCR-NEXT: s_movk_i32 s7, 0x1000 +; FLATSCR-NEXT: s_add_i32 s8, s6, s7 +; FLATSCR-NEXT: s_add_u32 s6, s6, s7 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 +; FLATSCR-NEXT: scratch_store_dword off, v2, s6 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 +; FLATSCR-NEXT: scratch_store_dword off, v2, s8 offset:4 +; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s8 +; FLATSCR-NEXT: scratch_load_dword v2, v2, off +; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v5 +; FLATSCR-NEXT: s_mov_b32 s32, s8 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 +; FLATSCR-NEXT: global_store_dword v[0:1], v2, off +; FLATSCR-NEXT: BB2_3: ; %bb.2 +; FLATSCR-NEXT: s_or_b64 exec, exec, s[4:5] +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; FLATSCR-NEXT: s_sub_u32 s32, s32, 16 +; FLATSCR-NEXT: s_mov_b32 s33, s9 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %cond0 = icmp eq i32 %arg.cond0, 0 @@ -213,39 +324,72 @@ } define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) { -; GCN-LABEL: func_non_entry_block_static_alloca_align64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_add_u32 s4, s32, 0xfc0 -; GCN-NEXT: s_mov_b32 s7, s33 -; GCN-NEXT: s_and_b32 s33, s4, 0xfffff000 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_add_u32 s32, s32, 0x2000 -; GCN-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz BB3_2 -; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_add_i32 s6, s32, 0x1000 -; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v5, s6 -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v2, 1 -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_lshl_add_u32 v2, v3, 2, s6 -; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v4 -; GCN-NEXT: s_mov_b32 s32, s6 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_u32_e32 v2, v2, v3 -; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: BB3_2: ; %bb.1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_sub_u32 s32, s32, 0x2000 -; GCN-NEXT: s_mov_b32 s33, s7 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; MUBUF-LABEL: func_non_entry_block_static_alloca_align64: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_add_u32 s4, s32, 0xfc0 +; MUBUF-NEXT: s_mov_b32 s7, s33 +; MUBUF-NEXT: s_and_b32 s33, s4, 0xfffff000 +; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; MUBUF-NEXT: s_add_u32 s32, s32, 0x2000 +; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc +; MUBUF-NEXT: s_cbranch_execz BB3_2 +; MUBUF-NEXT: ; %bb.1: ; %bb.0 +; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 +; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0 +; MUBUF-NEXT: v_mov_b32_e32 v5, s6 +; MUBUF-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; MUBUF-NEXT: v_mov_b32_e32 v2, 1 +; MUBUF-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6 +; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v4 +; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 +; MUBUF-NEXT: global_store_dword v[0:1], v2, off +; MUBUF-NEXT: BB3_2: ; %bb.1 +; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5] +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x2000 +; MUBUF-NEXT: s_mov_b32 s33, s7 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_add_u32 s4, s32, 63 +; FLATSCR-NEXT: s_mov_b32 s7, s33 +; FLATSCR-NEXT: s_and_b32 s33, s4, 0xffffffc0 +; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; FLATSCR-NEXT: s_add_u32 s32, s32, 0x80 +; FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; FLATSCR-NEXT: s_cbranch_execz BB3_2 +; FLATSCR-NEXT: ; %bb.1: ; %bb.0 +; FLATSCR-NEXT: s_add_i32 s6, s32, 0x1000 +; FLATSCR-NEXT: s_and_b32 s6, s6, 0xfffff000 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 +; FLATSCR-NEXT: scratch_store_dword off, v2, s6 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 +; FLATSCR-NEXT: scratch_store_dword off, v2, s6 offset:4 +; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s6 +; FLATSCR-NEXT: scratch_load_dword v2, v2, off +; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v4 +; FLATSCR-NEXT: s_mov_b32 s32, s6 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 +; FLATSCR-NEXT: global_store_dword v[0:1], v2, off +; FLATSCR-NEXT: BB3_2: ; %bb.1 +; FLATSCR-NEXT: s_or_b64 exec, exec, s[4:5] +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: 
global_store_dword v[0:1], v0, off +; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x80 +; FLATSCR-NEXT: s_mov_b32 s33, s7 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %cond = icmp eq i32 %arg.cond, 0 br i1 %cond, label %bb.0, label %bb.1 Index: llvm/test/CodeGen/AMDGPU/scratch-simple.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -1,9 +1,11 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI,SIVI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI,SIVI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI,SIVI,MUBUF %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI,SIVI,MUBUF %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10,MUBUF,GFX9-MUBUF,GFX9_10-MUBUF %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -filetype=obj -amdgpu-use-divergent-register-indexing < %s | llvm-readobj -r - | FileCheck --check-prefix=RELS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W64,GFX9_10 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10,MUBUF,GFX10_W32-MUBUF,GFX9_10-MUBUF %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W64,GFX9_10,MUBUF,GFX10_W64-MUBUF,GFX9_10-MUBUF %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10,FLATSCR,GFX9-FLATSCR %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10,FLATSCR,GFX10_W32-FLATSCR,GFX9_10-FLATSCR %s ; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD0 0x0 ; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD1 0x0 @@ -14,14 +16,25 @@ ; ; GCN-LABEL: {{^}}ps_main: -; GCN-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 -; GCN-DAG: s_mov_b32 s1, SCRATCH_RSRC_DWORD1 -; GCN-DAG: s_mov_b32 s2, -1 +; MUBUF-DAG: s_mov_b32 s0, 
SCRATCH_RSRC_DWORD0
+; MUBUF-DAG: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; MUBUF-DAG: s_mov_b32 s2, -1
; SI-DAG: s_mov_b32 s3, 0xe8f000
; VI-DAG: s_mov_b32 s3, 0xe80000
-; GFX9-DAG: s_mov_b32 s3, 0xe00000
-; GFX10_W32-DAG: s_mov_b32 s3, 0x31c16000
-; GFX10_W64-DAG: s_mov_b32 s3, 0x31e16000
+; GFX9-MUBUF-DAG: s_mov_b32 s3, 0xe00000
+; GFX10_W32-MUBUF-DAG: s_mov_b32 s3, 0x31c16000
+; GFX10_W64-MUBUF-DAG: s_mov_b32 s3, 0x31e16000
+
+; FLATSCR-NOT: SCRATCH_RSRC_DWORD
+
+; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0
+; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset:
+; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0
+; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset:
+
+; GFX10_W32-FLATSCR: scratch_store_dword off, v2, null offset:
+; GFX10_W32-FLATSCR: scratch_store_dword off, v2, null offset:
+
; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
; GCN-NOT: s_mov_b32 s0
@@ -29,8 +42,10 @@
; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]]
; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]]
-; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+; MUBUF: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+; MUBUF: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[LO_OFF]], off
+; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[HI_OFF]], off
define amdgpu_ps float @ps_main(i32 %idx) {
%v1 = extractelement <81 x float> , i32 %idx
%v2 = extractelement <81 x float> , i32 %idx
@@ -39,10 +54,22 @@
}
; GCN-LABEL: {{^}}vs_main:
-; GCN-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
; GCN-NOT: s_mov_b32 s0
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+
+; FLATSCR-NOT: SCRATCH_RSRC_DWORD
+
+; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+
+; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0
+; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset:
+; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0
+; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset:
+
+; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
+; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
+
define amdgpu_vs float @vs_main(i32 %idx) {
%v1 = extractelement <81 x float> , i32 %idx
%v2 = extractelement <81 x float> , i32 %idx
@@ -51,9 +78,15 @@
}
; GCN-LABEL: {{^}}cs_main:
-; GCN-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+
+; FLATSCR-NOT: SCRATCH_RSRC_DWORD
+
+; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+
+; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
+; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
define amdgpu_cs float @cs_main(i32 %idx) {
%v1 = extractelement <81 x float> , i32 %idx
%v2 = extractelement <81 x float> , i32 %idx
@@ -67,10 +100,14 @@
; SIVI: 
buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 -; GFX9_10-NOT: s_mov_b32 s5 -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; GFX9_10-NOT: s_mov_b32 s5 +; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen + +; FLATSCR-NOT: SCRATCH_RSRC_DWORD +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off define amdgpu_hs float @hs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -83,9 +120,13 @@ ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen + +; FLATSCR-NOT: SCRATCH_RSRC_DWORD +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off define amdgpu_gs float @gs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -99,17 +140,21 @@ ; (i.e. SI_RETURN_TO_EPILOG) can access the scratch wave offset. 
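+; For the divergent-index loads in the shaders below the rewrite is
+; mechanical (forms trimmed from the checks, not authoritative):
+;   MUBUF:   buffer_load_dword v0, v1, s[0:3], 0 offen
+;   FLATSCR: scratch_load_dword v0, v1, off
+; i.e. the VGPR byte offset is kept, while the 128-bit descriptor and the
+; SOffset operand are dropped; 'off' fills the unused saddr slot.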
; GCN-LABEL: {{^}}hs_ir_uses_scratch_offset: -; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; MUBUF: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; FLATSCR-NOT: SCRATCH_RSRC_DWORD ; SIVI-NOT: s_mov_b32 s6 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; GFX9_10-NOT: s_mov_b32 s5 -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; GCN-DAG: s_mov_b32 s2, s5 + +; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -120,15 +165,19 @@ } ; GCN-LABEL: {{^}}gs_ir_uses_scratch_offset: -; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; MUBUF: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; FLATSCR-NOT: SCRATCH_RSRC_DWORD ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; GCN-DAG: s_mov_b32 s2, s5 + +; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx Index: llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=MUBUF %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -amdgpu-enable-flat-scratch -verify-machineinstrs | FileCheck -check-prefix=FLATSCR %s ; FIXME: The MUBUF loads in this test output are incorrect, their SOffset ; should use the frame offset register, not the ABI stack pointer register. We @@ -13,44 +14,86 @@ ; An assert was hit when frame offset register was used to address FrameIndex. 
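+; In the FLATSCR checks below, the 0x4000 frame offset is too large for the
+; signed immediate offset field of the scratch instructions, so it has to be
+; materialized into an SGPR first; the scavenger happens to hand out vcc_lo,
+; and the surrounding s_nop / s_waitcnt_depctr instructions look like gfx1010
+; hazard padding around the back-to-back vcc_lo writes.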
define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <4 x i32> addrspace(1)* %input, <4 x float> addrspace(1)* %output, i32 %i) { -; GCN-LABEL: kernel_background_evaluate: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 -; GCN-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s38, -1 -; GCN-NEXT: s_mov_b32 s39, 0x31c16000 -; GCN-NEXT: s_add_u32 s36, s36, s3 -; GCN-NEXT: s_addc_u32 s37, s37, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 0x2000 -; GCN-NEXT: v_mov_b32_e32 v2, 0x4000 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: v_mov_b32_e32 v4, 0x400000 -; GCN-NEXT: s_mov_b32 s32, 0xc0000 -; GCN-NEXT: v_add_nc_u32_e64 v40, 4, 0x4000 -; GCN-NEXT: ; implicit-def: $vcc_hi -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_mov_b64 s[0:1], s[36:37] -; GCN-NEXT: s_mov_b64 s[2:3], s[38:39] -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GCN-NEXT: s_cbranch_execz BB0_2 -; GCN-NEXT: ; %bb.1: ; %if.then4.i -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: buffer_load_dword v0, v40, s[36:39], 0 offen -; GCN-NEXT: buffer_load_dword v1, v40, s[36:39], 0 offen offset:4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0 -; GCN-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0 -; GCN-NEXT: buffer_store_dword v0, v0, s[36:39], 0 offen -; GCN-NEXT: BB0_2: ; %shader_eval_surface.exit -; GCN-NEXT: s_endpgm +; MUBUF-LABEL: kernel_background_evaluate: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_load_dword s0, s[0:1], 0x24 +; MUBUF-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MUBUF-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MUBUF-NEXT: s_mov_b32 s38, -1 +; MUBUF-NEXT: s_mov_b32 s39, 0x31c16000 +; MUBUF-NEXT: s_add_u32 s36, s36, s3 +; MUBUF-NEXT: s_addc_u32 s37, s37, 0 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x2000 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 +; MUBUF-NEXT: v_mov_b32_e32 v3, 0 +; MUBUF-NEXT: v_mov_b32_e32 v4, 0x400000 +; MUBUF-NEXT: s_mov_b32 s32, 0xc0000 +; MUBUF-NEXT: v_add_nc_u32_e64 v40, 4, 0x4000 +; MUBUF-NEXT: ; implicit-def: $vcc_hi +; MUBUF-NEXT: s_getpc_b64 s[4:5] +; MUBUF-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: v_mov_b32_e32 v0, s0 +; MUBUF-NEXT: s_mov_b64 s[0:1], s[36:37] +; MUBUF-NEXT: s_mov_b64 s[2:3], s[38:39] +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MUBUF-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; MUBUF-NEXT: s_and_saveexec_b32 s0, vcc_lo +; MUBUF-NEXT: s_cbranch_execz BB0_2 +; MUBUF-NEXT: ; %bb.1: ; %if.then4.i +; MUBUF-NEXT: s_clause 0x1 +; MUBUF-NEXT: buffer_load_dword v0, v40, s[36:39], 0 offen +; MUBUF-NEXT: buffer_load_dword v1, v40, s[36:39], 0 offen offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_nc_u32_e32 v0, v1, v0 +; MUBUF-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0 +; MUBUF-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0 +; MUBUF-NEXT: buffer_store_dword v0, v0, s[36:39], 0 offen +; MUBUF-NEXT: BB0_2: ; %shader_eval_surface.exit +; MUBUF-NEXT: s_endpgm +; +; FLATSCR-LABEL: kernel_background_evaluate: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_load_dword s0, s[0:1], 0x24 +; FLATSCR-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; FLATSCR-NEXT: s_mov_b32 s37, 
SCRATCH_RSRC_DWORD1 +; FLATSCR-NEXT: s_mov_b32 s38, -1 +; FLATSCR-NEXT: s_mov_b32 s39, 0x31c16000 +; FLATSCR-NEXT: s_add_u32 s36, s36, s3 +; FLATSCR-NEXT: s_addc_u32 s37, s37, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v1, 0x2000 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 0x4000 +; FLATSCR-NEXT: v_mov_b32_e32 v3, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v4, 0x400000 +; FLATSCR-NEXT: s_movk_i32 s32, 0x6000 +; FLATSCR-NEXT: ; implicit-def: $vcc_hi +; FLATSCR-NEXT: s_getpc_b64 s[4:5] +; FLATSCR-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; FLATSCR-NEXT: s_mov_b64 s[0:1], s[36:37] +; FLATSCR-NEXT: s_mov_b64 s[2:3], s[38:39] +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[4:5] +; FLATSCR-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; FLATSCR-NEXT: s_and_saveexec_b32 s0, vcc_lo +; FLATSCR-NEXT: s_cbranch_execz BB0_2 +; FLATSCR-NEXT: ; %bb.1: ; %if.then4.i +; FLATSCR-NEXT: s_movk_i32 vcc_lo, 0x4000 +; FLATSCR-NEXT: s_nop 0 +; FLATSCR-NEXT: s_nop 0 +; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 +; FLATSCR-NEXT: s_waitcnt_depctr 0xffe3 +; FLATSCR-NEXT: s_movk_i32 vcc_lo, 0x4000 +; FLATSCR-NEXT: scratch_load_dword v1, off, vcc_lo offset:8 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_nc_u32_e32 v0, v1, v0 +; FLATSCR-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0 +; FLATSCR-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0 +; FLATSCR-NEXT: scratch_store_dword off, v0, s0 +; FLATSCR-NEXT: BB0_2: ; %shader_eval_surface.exit +; FLATSCR-NEXT: s_endpgm entry: %sd = alloca < 1339 x i32>, align 8192, addrspace(5) %state = alloca <4 x i32>, align 16, addrspace(5) Index: llvm/test/CodeGen/AMDGPU/store-hi16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/store-hi16.ll +++ llvm/test/CodeGen/AMDGPU/store-hi16.ll @@ -1,6 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-MUBUF %s ; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX906,GFX9,NO-D16-HI %s ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-FLATSCR %s ; GCN-LABEL: {{^}}store_global_hi_v2i16: ; GCN: s_waitcnt @@ -389,7 +390,8 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} @@ -408,7 +410,8 @@ ; GCN-LABEL: {{^}}store_private_hi_v2f16: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 
offen{{$}} +; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} @@ -427,7 +430,8 @@ ; GCN-LABEL: {{^}}store_private_hi_i32_shift: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} @@ -445,7 +449,8 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}} @@ -464,7 +469,8 @@ ; GCN-LABEL: {{^}}store_private_hi_i8_shift: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}} @@ -481,7 +487,8 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_max_offset: ; GCN: s_waitcnt -; GFX900: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-MUBUF: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-FLATSCR: scratch_store_short_d16_hi off, v0, s32 offset:4094{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0 ; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s32 offset:4094{{$}} @@ -502,7 +509,9 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], 0{{$}} +; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], 0{{$}} +; GFX900-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0 +; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, [[SOFF]]{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], 0{{$}} @@ -522,7 +531,9 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], 0{{$}} +; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], 0{{$}} +; GFX900-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0 +; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, [[SOFF]]{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0 ; NO-D16-HI: buffer_store_byte v0, off, s[0:3], 0{{$}} @@ -634,8 +645,10 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset: ; GCN: s_waitcnt -; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094 +; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, s32 offset:4094 define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -651,8 +664,10 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset: ; GCN: s_waitcnt -; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: 
buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4095 +; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, s32 offset:4095 define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5)
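+; (The d16-hi selection all of these store_private_hi_* tests exercise
+; reduces to storing the high half of a 32-bit value; a minimal IR sketch,
+; names hypothetical:
+;   %hi32 = lshr i32 %val, 16
+;   %hi16 = trunc i32 %hi32 to i16
+;   store i16 %hi16, i16 addrspace(5)* %ptr
+; gfx900 folds the shift into buffer/scratch_store_short_d16_hi; note the
+; swapped operand order, data-first for buffer_store_* but address-first for
+; scratch_store_*, and that offset:4094/4095 sit at the top of the immediate
+; offset range both encodings can still fold.)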