Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -43,11 +43,12 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -74,23 +75,42 @@ #define DEBUG_TYPE "si-load-store-opt" namespace { +enum InstClassEnum { + UNKNOWN, + DS_READ, + DS_WRITE, + S_BUFFER_LOAD_IMM, + BUFFER_LOAD_IDXEN, + BUFFER_LOAD_OFFEN, + BUFFER_LOAD_OFFSET, + BUFFER_STORE_IDXEN, + BUFFER_STORE_OFFEN, + BUFFER_STORE_OFFSET, + BUFFER_LOAD_IDXEN_exact, + BUFFER_LOAD_OFFEN_exact, + BUFFER_LOAD_OFFSET_exact, + BUFFER_STORE_IDXEN_exact, + BUFFER_STORE_OFFEN_exact, + BUFFER_STORE_OFFSET_exact, +}; -class SILoadStoreOptimizer : public MachineFunctionPass { - enum InstClassEnum { - DS_READ_WRITE, - S_BUFFER_LOAD_IMM, - BUFFER_LOAD_OFFEN, - BUFFER_LOAD_OFFSET, - BUFFER_STORE_OFFEN, - BUFFER_STORE_OFFSET, - }; +enum RegisterEnum { + SBASE = 0x1, + SRSRC = 0x2, + SOFFSET = 0x4, + VADDR = 0x8, + ADDR = 0x10, +}; +class SILoadStoreOptimizer : public MachineFunctionPass { struct CombineInfo { MachineBasicBlock::iterator I; MachineBasicBlock::iterator Paired; unsigned EltSize; unsigned Offset0; unsigned Offset1; + unsigned Width0; + unsigned Width1; unsigned BaseOff; InstClassEnum InstClass; bool GLC0; @@ -98,9 +118,8 @@ bool SLC0; bool SLC1; bool UseST64; - bool IsX2; - SmallVector InstsToMove; - }; + SmallVector InstsToMove; + }; private: const GCNSubtarget *STM = nullptr; @@ -108,9 +127,17 @@ const SIRegisterInfo *TRI = nullptr; MachineRegisterInfo *MRI = nullptr; AliasAnalysis *AA = nullptr; - unsigned 
CreatedX2; + bool OptimizeAgain; static bool offsetsCanBeCombined(CombineInfo &CI); + static bool widthsFit(const CombineInfo &CI); + static unsigned getNewOpcode(const CombineInfo &CI); + static std::pair getSubRegIdxs(const CombineInfo &CI); + static const TargetRegisterClass * + getTargetRegisterClass(const CombineInfo &CI); + static unsigned getOpcodeWidth(unsigned Opc); + static InstClassEnum getInstClass(unsigned Opc); + static unsigned getRegs(unsigned Opc); bool findMatchingInst(CombineInfo &CI); @@ -123,8 +150,6 @@ MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI); MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); - unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2, - bool &IsOffen) const; MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI); public: @@ -153,8 +178,8 @@ INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, - "SI Load Store Optimizer", false, false) +INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", + false, false) char SILoadStoreOptimizer::ID = 0; @@ -165,7 +190,7 @@ } static void moveInstsAfter(MachineBasicBlock::iterator I, - ArrayRef InstsToMove) { + ArrayRef InstsToMove) { MachineBasicBlock *MBB = I->getParent(); ++I; for (MachineInstr *MI : InstsToMove) { @@ -191,21 +216,19 @@ static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, MachineBasicBlock::iterator B, const SIInstrInfo *TII, - AliasAnalysis * AA) { + AliasAnalysis *AA) { // RAW or WAR - cannot reorder // WAW - cannot reorder // RAR - safe to reorder return !(A->mayStore() || B->mayStore()) || - TII->areMemAccessesTriviallyDisjoint(*A, *B, AA); + TII->areMemAccessesTriviallyDisjoint(*A, *B, AA); } // Add MI and its defs to the lists if MI reads one of the defs 
that are // already in the list. Returns true in that case. -static bool -addToListsIfDependent(MachineInstr &MI, - DenseSet &RegDefs, - DenseSet &PhysRegUses, - SmallVectorImpl &Insts) { +static bool addToListsIfDependent(MachineInstr &MI, DenseSet &RegDefs, + DenseSet &PhysRegUses, + SmallVectorImpl &Insts) { for (MachineOperand &Use : MI.operands()) { // If one of the defs is read, then there is a use of Def between I and the // instruction that I will potentially be merged with. We will need to move @@ -228,18 +251,16 @@ return false; } -static bool -canMoveInstsAcrossMemOp(MachineInstr &MemOp, - ArrayRef InstsToMove, - const SIInstrInfo *TII, - AliasAnalysis *AA) { +static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, + ArrayRef InstsToMove, + const SIInstrInfo *TII, AliasAnalysis *AA) { assert(MemOp.mayLoadOrStore()); for (MachineInstr *InstToMove : InstsToMove) { if (!InstToMove->mayLoadOrStore()) continue; if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA)) - return false; + return false; } return true; } @@ -260,10 +281,9 @@ CI.BaseOff = 0; // Handle SMEM and VMEM instructions. - if (CI.InstClass != DS_READ_WRITE) { - unsigned Diff = CI.IsX2 ? 
2 : 1; - return (EltOffset0 + Diff == EltOffset1 || - EltOffset1 + Diff == EltOffset0) && + if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { + return (EltOffset0 + CI.Width0 == EltOffset1 || + EltOffset1 + CI.Width1 == EltOffset0) && CI.GLC0 == CI.GLC1 && (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1); } @@ -305,42 +325,280 @@ return false; } +bool SILoadStoreOptimizer::widthsFit(const CombineInfo &CI) { + const unsigned Width = (CI.Width0 + CI.Width1); + switch (CI.InstClass) { + default: + return Width <= 4; + case S_BUFFER_LOAD_IMM: + switch (Width) { + default: + return false; + case 2: + case 4: + return true; + } + } +} + +unsigned SILoadStoreOptimizer::getOpcodeWidth(unsigned Opc) { + switch (Opc) { + default: + return 0; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::BUFFER_LOAD_DWORD_IDXEN: + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: + case AMDGPU::BUFFER_STORE_DWORD_IDXEN: + case AMDGPU::BUFFER_STORE_DWORD_OFFEN: + case AMDGPU::BUFFER_STORE_DWORD_OFFSET: + case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact: + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: + case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact: + case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: + case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: + return 1; + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::BUFFER_LOAD_DWORDX2_IDXEN: + case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN: + case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET: + case AMDGPU::BUFFER_STORE_DWORDX2_IDXEN: + case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: + case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET: + case AMDGPU::BUFFER_LOAD_DWORDX2_IDXEN_exact: + case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN_exact: + case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET_exact: + case AMDGPU::BUFFER_STORE_DWORDX2_IDXEN_exact: + case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact: + case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact: + return 2; + case AMDGPU::BUFFER_LOAD_DWORDX3_IDXEN: + case 
AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN: + case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET: + case AMDGPU::BUFFER_STORE_DWORDX3_IDXEN: + case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN: + case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET: + case AMDGPU::BUFFER_LOAD_DWORDX3_IDXEN_exact: + case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN_exact: + case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET_exact: + case AMDGPU::BUFFER_STORE_DWORDX3_IDXEN_exact: + case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN_exact: + case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET_exact: + return 3; + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + case AMDGPU::BUFFER_LOAD_DWORDX4_IDXEN: + case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: + case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET: + case AMDGPU::BUFFER_STORE_DWORDX4_IDXEN: + case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: + case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET: + case AMDGPU::BUFFER_LOAD_DWORDX4_IDXEN_exact: + case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN_exact: + case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET_exact: + case AMDGPU::BUFFER_STORE_DWORDX4_IDXEN_exact: + case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact: + case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact: + return 4; + } +} + +InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) { + switch (Opc) { + default: + return UNKNOWN; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return S_BUFFER_LOAD_IMM; + case AMDGPU::BUFFER_LOAD_DWORD_IDXEN: + case AMDGPU::BUFFER_LOAD_DWORDX2_IDXEN: + case AMDGPU::BUFFER_LOAD_DWORDX3_IDXEN: + case AMDGPU::BUFFER_LOAD_DWORDX4_IDXEN: + return BUFFER_LOAD_IDXEN; + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: + case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN: + case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN: + case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: + return BUFFER_LOAD_OFFEN; + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: + case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET: + case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET: + case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET: + return BUFFER_LOAD_OFFSET; + case 
AMDGPU::BUFFER_STORE_DWORD_IDXEN: + case AMDGPU::BUFFER_STORE_DWORDX2_IDXEN: + case AMDGPU::BUFFER_STORE_DWORDX3_IDXEN: + case AMDGPU::BUFFER_STORE_DWORDX4_IDXEN: + return BUFFER_STORE_IDXEN; + case AMDGPU::BUFFER_STORE_DWORD_OFFEN: + case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: + case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN: + case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: + return BUFFER_STORE_OFFEN; + case AMDGPU::BUFFER_STORE_DWORD_OFFSET: + case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET: + case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET: + case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET: + return BUFFER_STORE_OFFSET; + case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact: + case AMDGPU::BUFFER_LOAD_DWORDX2_IDXEN_exact: + case AMDGPU::BUFFER_LOAD_DWORDX3_IDXEN_exact: + case AMDGPU::BUFFER_LOAD_DWORDX4_IDXEN_exact: + return BUFFER_LOAD_IDXEN_exact; + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: + case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN_exact: + case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN_exact: + case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN_exact: + return BUFFER_LOAD_OFFEN_exact; + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: + case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET_exact: + case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET_exact: + case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET_exact: + return BUFFER_LOAD_OFFSET_exact; + case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact: + case AMDGPU::BUFFER_STORE_DWORDX2_IDXEN_exact: + case AMDGPU::BUFFER_STORE_DWORDX3_IDXEN_exact: + case AMDGPU::BUFFER_STORE_DWORDX4_IDXEN_exact: + return BUFFER_STORE_IDXEN_exact; + case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: + case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact: + case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN_exact: + case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact: + return BUFFER_STORE_OFFEN_exact; + case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: + case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact: + case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET_exact: + case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact: + return BUFFER_STORE_OFFSET_exact; + case AMDGPU::DS_READ_B32: + case 
AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64_gfx9: + return DS_READ; + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64_gfx9: + return DS_WRITE; + } +} + +unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) { + switch (Opc) { + default: + return 0; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return SBASE; + case AMDGPU::BUFFER_LOAD_DWORD_IDXEN: + case AMDGPU::BUFFER_LOAD_DWORDX2_IDXEN: + case AMDGPU::BUFFER_LOAD_DWORDX3_IDXEN: + case AMDGPU::BUFFER_LOAD_DWORDX4_IDXEN: + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: + case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN: + case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN: + case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: + case AMDGPU::BUFFER_STORE_DWORD_IDXEN: + case AMDGPU::BUFFER_STORE_DWORDX2_IDXEN: + case AMDGPU::BUFFER_STORE_DWORDX3_IDXEN: + case AMDGPU::BUFFER_STORE_DWORDX4_IDXEN: + case AMDGPU::BUFFER_STORE_DWORD_OFFEN: + case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: + case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN: + case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: + case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact: + case AMDGPU::BUFFER_LOAD_DWORDX2_IDXEN_exact: + case AMDGPU::BUFFER_LOAD_DWORDX3_IDXEN_exact: + case AMDGPU::BUFFER_LOAD_DWORDX4_IDXEN_exact: + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: + case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN_exact: + case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN_exact: + case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN_exact: + case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact: + case AMDGPU::BUFFER_STORE_DWORDX2_IDXEN_exact: + case AMDGPU::BUFFER_STORE_DWORDX3_IDXEN_exact: + case AMDGPU::BUFFER_STORE_DWORDX4_IDXEN_exact: + case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: + case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact: + case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN_exact: + case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact: + return SRSRC | SOFFSET | VADDR; + case 
AMDGPU::BUFFER_LOAD_DWORD_OFFSET: + case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET: + case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET: + case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET: + case AMDGPU::BUFFER_STORE_DWORD_OFFSET: + case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET: + case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET: + case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET: + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: + case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET_exact: + case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET_exact: + case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET_exact: + case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: + case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact: + case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET_exact: + case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact: + return SRSRC | SOFFSET; + case AMDGPU::DS_READ_B32: + case AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64_gfx9: + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64_gfx9: + return ADDR; + } +} + bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); MachineBasicBlock::iterator E = MBB->end(); MachineBasicBlock::iterator MBBI = CI.I; - unsigned AddrOpName[3] = {0}; - int AddrIdx[3]; - const MachineOperand *AddrReg[3]; + const unsigned Opc = CI.I->getOpcode(); + const InstClassEnum InstClass = getInstClass(Opc); + + if (InstClass == UNKNOWN) { + return false; + } + + const unsigned Regs = getRegs(Opc); + + unsigned AddrOpName[5] = {0}; + int AddrIdx[5]; + const MachineOperand *AddrReg[5]; unsigned NumAddresses = 0; - switch (CI.InstClass) { - case DS_READ_WRITE: + if (Regs & ADDR) { AddrOpName[NumAddresses++] = AMDGPU::OpName::addr; - break; - case S_BUFFER_LOAD_IMM: + } + + if (Regs & SBASE) { AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase; - break; - case BUFFER_LOAD_OFFEN: - case BUFFER_STORE_OFFEN: - AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; - AddrOpName[NumAddresses++] = 
AMDGPU::OpName::vaddr; - AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; - break; - case BUFFER_LOAD_OFFSET: - case BUFFER_STORE_OFFSET: + } + + if (Regs & SRSRC) { AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; + } + + if (Regs & SOFFSET) { AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; - break; + } + + if (Regs & VADDR) { + AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; } for (unsigned i = 0; i < NumAddresses; i++) { AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]); AddrReg[i] = &CI.I->getOperand(AddrIdx[i]); - // We only ever merge operations with the same base address register, so don't - // bother scanning forward if there are no other uses. + // We only ever merge operations with the same base address register, so + // don't bother scanning forward if there are no other uses. if (AddrReg[i]->isReg() && (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) || MRI->hasOneNonDBGUse(AddrReg[i]->getReg()))) @@ -353,8 +611,11 @@ DenseSet PhysRegUsesToMove; addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); - for ( ; MBBI != E; ++MBBI) { - if (MBBI->getOpcode() != CI.I->getOpcode()) { + for (; MBBI != E; ++MBBI) { + const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE); + + if ((getInstClass(MBBI->getOpcode()) != InstClass) || + (IsDS && (MBBI->getOpcode() != Opc))) { // This is not a matching DS instruction, but we can keep looking as // long as one of these conditions are met: // 1. It is safe to move I down past MBBI. @@ -368,8 +629,8 @@ } if (MBBI->mayLoadOrStore() && - (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || - !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) { + (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || + !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) { // We fail condition #1, but we may still be able to satisfy condition // #2. 
Add this instruction to the move list and then we will check // if condition #2 holds once we have selected the matching instruction. @@ -413,8 +674,8 @@ continue; } - // Check same base pointer. Be careful of subregisters, which can occur with - // vectors of pointers. + // Check same base pointer. Be careful of subregisters, which can occur + // with vectors of pointers. if (AddrReg[i]->getReg() != AddrRegNext.getReg() || AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { Match = false; @@ -423,13 +684,15 @@ } if (Match) { - int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), - AMDGPU::OpName::offset); + int OffsetIdx = + AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset); CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm(); + CI.Width0 = getOpcodeWidth(Opc); CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm(); + CI.Width1 = getOpcodeWidth(MBBI->getOpcode()); CI.Paired = MBBI; - if (CI.InstClass == DS_READ_WRITE) { + if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) { CI.Offset0 &= 0xffff; CI.Offset1 &= 0xffff; } else { @@ -445,7 +708,7 @@ // We also need to go through the list of instructions that we plan to // move and make sure they are all safe to move down past the merged // instruction. - if (offsetsCanBeCombined(CI)) + if (widthsFit(CI) && offsetsCanBeCombined(CI)) if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) return true; } @@ -472,12 +735,12 @@ if (STM->ldsRequiresM0Init()) return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; - return (EltSize == 4) ? - AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9; + return (EltSize == 4) ? 
AMDGPU::DS_READ2ST64_B32_gfx9 + : AMDGPU::DS_READ2ST64_B64_gfx9; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); // Be careful, since the addresses could be subregisters themselves in weird @@ -489,8 +752,8 @@ unsigned NewOffset0 = CI.Offset0; unsigned NewOffset1 = CI.Offset1; - unsigned Opc = CI.UseST64 ? - read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); + unsigned Opc = + CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; @@ -502,13 +765,12 @@ } assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && - (NewOffset0 != NewOffset1) && - "Computed offset doesn't fit"); + (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); const MCInstrDesc &Read2Desc = TII->get(Opc); - const TargetRegisterClass *SuperRC - = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; + const TargetRegisterClass *SuperRC = + (CI.EltSize == 4) ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; unsigned DestReg = MRI->createVirtualRegister(SuperRC); DebugLoc DL = CI.I->getDebugLoc(); @@ -519,23 +781,24 @@ if (CI.BaseOff) { unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) - .addImm(CI.BaseOff); + .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) - .addReg(ImmReg) - .addReg(AddrReg->getReg(), 0, BaseSubReg); + .addReg(ImmReg) + .addReg(AddrReg->getReg(), 0, BaseSubReg); BaseSubReg = 0; } - MachineInstrBuilder Read2 = BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg) - .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + MachineInstrBuilder Read2 = + BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg) + .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); (void)Read2; @@ -562,32 +825,36 @@ unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { if (STM->ldsRequiresM0Init()) return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; - return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9; + return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 + : AMDGPU::DS_WRITE2_B64_gfx9; } unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { if (STM->ldsRequiresM0Init()) - return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; + return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 + : AMDGPU::DS_WRITE2ST64_B64; - return (EltSize == 4) ? - AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9; + return (EltSize == 4) ? 
AMDGPU::DS_WRITE2ST64_B32_gfx9 + : AMDGPU::DS_WRITE2ST64_B64_gfx9; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); // Be sure to use .addOperand(), and not .addReg() with these. We want to be // sure we preserve the subregister index and any register flags set on them. - const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); - const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); - const MachineOperand *Data1 - = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0); + const MachineOperand *AddrReg = + TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); + const MachineOperand *Data0 = + TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); + const MachineOperand *Data1 = + TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0); unsigned NewOffset0 = CI.Offset0; unsigned NewOffset1 = CI.Offset1; - unsigned Opc = CI.UseST64 ? - write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); + unsigned Opc = + CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); if (NewOffset0 > NewOffset1) { // Canonicalize the merged instruction so the smaller offset comes first. 
@@ -596,8 +863,7 @@ } assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && - (NewOffset0 != NewOffset1) && - "Computed offset doesn't fit"); + (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); const MCInstrDesc &Write2Desc = TII->get(Opc); DebugLoc DL = CI.I->getDebugLoc(); @@ -608,25 +874,26 @@ if (CI.BaseOff) { unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) - .addImm(CI.BaseOff); + .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) - .addReg(ImmReg) - .addReg(AddrReg->getReg(), 0, BaseSubReg); + .addReg(ImmReg) + .addReg(AddrReg->getReg(), 0, BaseSubReg); BaseSubReg = 0; } - MachineInstrBuilder Write2 = BuildMI(*MBB, CI.Paired, DL, Write2Desc) - .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr - .add(*Data0) // data0 - .add(*Data1) // data1 - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + MachineInstrBuilder Write2 = + BuildMI(*MBB, CI.Paired, DL, Write2Desc) + .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr + .add(*Data0) // data0 + .add(*Data1) // data1 + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); moveInstsAfter(Write2, CI.InstsToMove); @@ -638,15 +905,14 @@ return Next; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - unsigned Opcode = CI.IsX2 ? 
AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM : - AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; + const unsigned Opcode = getNewOpcode(CI); + + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); - const TargetRegisterClass *SuperRC = - CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass; unsigned DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); @@ -656,12 +922,9 @@ .addImm(CI.GLC0) // glc .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); - unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; - unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; - - // Handle descending offsets - if (CI.Offset0 > CI.Offset1) - std::swap(SubRegIdx0, SubRegIdx1); + std::pair SubRegIdx = getSubRegIdxs(CI); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); // Copy to the old destination registers. const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); @@ -683,29 +946,25 @@ return Next; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - unsigned Opcode; - if (CI.InstClass == BUFFER_LOAD_OFFEN) { - Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN : - AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; - } else { - Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET : - AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; - } + const unsigned Opcode = getNewOpcode(CI); - const TargetRegisterClass *SuperRC = - CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass; + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); + + // Copy to the new source register. 
unsigned DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg); - if (CI.InstClass == BUFFER_LOAD_OFFEN) - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); + const unsigned Regs = getRegs(Opcode); + + if (Regs & VADDR) + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -715,12 +974,9 @@ .addImm(0) // tfe .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); - unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; - unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; - - // Handle descending offsets - if (CI.Offset0 > CI.Offset1) - std::swap(SubRegIdx0, SubRegIdx1); + std::pair SubRegIdx = getSubRegIdxs(CI); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); // Copy to the old destination registers. 
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); @@ -742,57 +998,267 @@ return Next; } -unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode( - const MachineInstr &I, bool &IsX2, bool &IsOffen) const { - IsX2 = false; - IsOffen = false; +unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) { + const unsigned Width = CI.Width0 + CI.Width1; - switch (I.getOpcode()) { - case AMDGPU::BUFFER_STORE_DWORD_OFFEN: - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN; - case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: - IsX2 = true; - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact: - IsX2 = true; - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact; - case AMDGPU::BUFFER_STORE_DWORD_OFFSET: - return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; - case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: - return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET: - IsX2 = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact: - IsX2 = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact; + switch (CI.InstClass) { + default: + return 0; + case S_BUFFER_LOAD_IMM: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; + case 4: + return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; + } + case BUFFER_LOAD_IDXEN: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::BUFFER_LOAD_DWORDX2_IDXEN; + case 3: + return AMDGPU::BUFFER_LOAD_DWORDX3_IDXEN; + case 4: + return AMDGPU::BUFFER_LOAD_DWORDX4_IDXEN; + } + case BUFFER_LOAD_OFFEN: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; + case 3: + return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN; + case 4: + return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN; + } + case 
BUFFER_LOAD_OFFSET: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; + case 3: + return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET; + case 4: + return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET; + } + case BUFFER_STORE_IDXEN: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::BUFFER_STORE_DWORDX2_IDXEN; + case 3: + return AMDGPU::BUFFER_STORE_DWORDX3_IDXEN; + case 4: + return AMDGPU::BUFFER_STORE_DWORDX4_IDXEN; + } + case BUFFER_STORE_OFFEN: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN; + case 3: + return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN; + case 4: + return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN; + } + case BUFFER_STORE_OFFSET: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; + case 3: + return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET; + case 4: + return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; + } + case BUFFER_LOAD_IDXEN_exact: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::BUFFER_LOAD_DWORDX2_IDXEN_exact; + case 3: + return AMDGPU::BUFFER_LOAD_DWORDX3_IDXEN_exact; + case 4: + return AMDGPU::BUFFER_LOAD_DWORDX4_IDXEN_exact; + } + case BUFFER_LOAD_OFFEN_exact: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN_exact; + case 3: + return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN_exact; + case 4: + return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN_exact; + } + case BUFFER_LOAD_OFFSET_exact: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET_exact; + case 3: + return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET_exact; + case 4: + return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET_exact; + } + case BUFFER_STORE_IDXEN_exact: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::BUFFER_STORE_DWORDX2_IDXEN_exact; + case 3: + return AMDGPU::BUFFER_STORE_DWORDX3_IDXEN_exact; + case 4: + return AMDGPU::BUFFER_STORE_DWORDX4_IDXEN_exact; + } + case 
BUFFER_STORE_OFFEN_exact: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact; + case 3: + return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN_exact; + case 4: + return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact; + } + case BUFFER_STORE_OFFSET_exact: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact; + case 3: + return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET_exact; + case 4: + return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact; + } } - return 0; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( - CombineInfo &CI) { +std::pair +SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) { + if (CI.Offset0 > CI.Offset1) { + switch (CI.Width0) { + default: + return std::make_pair(0, 0); + case 1: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub1, AMDGPU::sub0); + case 2: + return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1); + case 3: + return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2); + } + case 2: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0); + case 2: + return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1); + } + case 3: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0); + } + } + } else { + switch (CI.Width0) { + default: + return std::make_pair(0, 0); + case 1: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub0, AMDGPU::sub1); + case 2: + return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2); + case 3: + return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3); + } + case 2: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2); + case 2: + return 
std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3); + } + case 3: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3); + } + } + } +} + +const TargetRegisterClass * +SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) { + if (CI.InstClass == S_BUFFER_LOAD_IMM) { + switch (CI.Width0 + CI.Width1) { + default: + return nullptr; + case 2: + return &AMDGPU::SReg_64_XEXECRegClass; + case 4: + return &AMDGPU::SReg_128RegClass; + case 8: + return &AMDGPU::SReg_256RegClass; + case 16: + return &AMDGPU::SReg_512RegClass; + } + } else { + switch (CI.Width0 + CI.Width1) { + default: + return nullptr; + case 2: + return &AMDGPU::VReg_64RegClass; + case 3: + return &AMDGPU::VReg_96RegClass; + case 4: + return &AMDGPU::VReg_128RegClass; + } + } +} + +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - bool Unused1, Unused2; - unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2); - unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; - unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; + const unsigned Opcode = getNewOpcode(CI); - // Handle descending offsets - if (CI.Offset0 > CI.Offset1) - std::swap(SubRegIdx0, SubRegIdx1); + std::pair SubRegIdx = getSubRegIdxs(CI); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); // Copy to the new source register. - const TargetRegisterClass *SuperRC = - CI.IsX2 ? 
&AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass; + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); unsigned SrcReg = MRI->createVirtualRegister(SuperRC); const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); @@ -805,10 +1271,12 @@ .addImm(SubRegIdx1); auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode)) - .addReg(SrcReg, RegState::Kill); + .addReg(SrcReg, RegState::Kill); + + const unsigned Regs = getRegs(Opcode); - if (CI.InstClass == BUFFER_STORE_OFFEN) - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); + if (Regs & VADDR) + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -841,90 +1309,73 @@ continue; } + const unsigned Opc = MI.getOpcode(); + CombineInfo CI; CI.I = I; - unsigned Opc = MI.getOpcode(); - if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 || - Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) { + CI.InstClass = getInstClass(Opc); - CI.InstClass = DS_READ_WRITE; + switch (CI.InstClass) { + default: + break; + case DS_READ: CI.EltSize = - (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4; - + (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 + : 4; if (findMatchingInst(CI)) { Modified = true; I = mergeRead2Pair(CI); } else { ++I; } - continue; - } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 || - Opc == AMDGPU::DS_WRITE_B32_gfx9 || - Opc == AMDGPU::DS_WRITE_B64_gfx9) { - CI.InstClass = DS_READ_WRITE; - CI.EltSize - = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4; - + case DS_WRITE: + CI.EltSize = + (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 
8 + : 4; if (findMatchingInst(CI)) { Modified = true; I = mergeWrite2Pair(CI); } else { ++I; } - continue; - } - if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM || - Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) { - // EltSize is in units of the offset encoding. - CI.InstClass = S_BUFFER_LOAD_IMM; + case S_BUFFER_LOAD_IMM: CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4); - CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; if (findMatchingInst(CI)) { Modified = true; I = mergeSBufferLoadImmPair(CI); - if (!CI.IsX2) - CreatedX2++; + OptimizeAgain |= (CI.Width0 + CI.Width1) < 16; } else { ++I; } continue; - } - if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) { - if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN) - CI.InstClass = BUFFER_LOAD_OFFEN; - else - CI.InstClass = BUFFER_LOAD_OFFSET; - + case BUFFER_LOAD_IDXEN: + case BUFFER_LOAD_OFFEN: + case BUFFER_LOAD_OFFSET: + case BUFFER_LOAD_IDXEN_exact: + case BUFFER_LOAD_OFFEN_exact: + case BUFFER_LOAD_OFFSET_exact: CI.EltSize = 4; - CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; if (findMatchingInst(CI)) { Modified = true; I = mergeBufferLoadPair(CI); - if (!CI.IsX2) - CreatedX2++; + OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; } else { ++I; } continue; - } - - bool StoreIsX2, IsOffen; - if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) { - CI.InstClass = IsOffen ? 
BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET; + case BUFFER_STORE_IDXEN: + case BUFFER_STORE_OFFEN: + case BUFFER_STORE_OFFSET: + case BUFFER_STORE_IDXEN_exact: + case BUFFER_STORE_OFFEN_exact: + case BUFFER_STORE_OFFSET_exact: CI.EltSize = 4; - CI.IsX2 = StoreIsX2; if (findMatchingInst(CI)) { Modified = true; I = mergeBufferStorePair(CI); - if (!CI.IsX2) - CreatedX2++; + OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; } else { ++I; } @@ -958,12 +1409,10 @@ bool Modified = false; for (MachineBasicBlock &MBB : MF) { - CreatedX2 = 0; - Modified |= optimizeBlock(MBB); - - // Run again to convert x2 to x4. - if (CreatedX2 >= 1) + do { + OptimizeAgain = false; Modified |= optimizeBlock(MBB); + } while (OptimizeAgain); } return Modified; Index: test/CodeGen/AMDGPU/cvt_f32_ubyte.ll =================================================================== --- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -36,10 +36,10 @@ ; GCN-LABEL: {{^}}load_v3i8_to_v3f32: ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]] ; GCN-NOT: v_cvt_f32_ubyte3_e32 -; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]] -; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]] +; GCN-DAG: v_cvt_f32_ubyte2_e32 v[[HIRESULT:[0-9]+]], [[VAL]] +; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[VAL]] ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]] -; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +; GCN: buffer_store_dwordx3 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid Index: test/CodeGen/AMDGPU/early-if-convert-cost.ll =================================================================== --- test/CodeGen/AMDGPU/early-if-convert-cost.ll +++ test/CodeGen/AMDGPU/early-if-convert-cost.ll @@ -60,8 +60,7 @@ ; GCN: 
v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc -; GCN-DAG: buffer_store_dword v -; GCN-DAG: buffer_store_dwordx2 +; GCN-DAG: buffer_store_dwordx3 define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 { entry: %v = load <3 x i32>, <3 x i32> addrspace(1)* %in Index: test/CodeGen/AMDGPU/insert_vector_elt.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -95,8 +95,7 @@ ; GCN-LABEL: {{^}}dynamic_insertelement_v3f32: ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] -; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: -; GCN-DAG: buffer_store_dword v +; GCN-DAG: buffer_store_dwordx3 {{v\[}}[[LOW_RESULT_REG]]: define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind { %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 @@ -146,8 +145,7 @@ ; GCN-LABEL: {{^}}dynamic_insertelement_v3i32: ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5 -; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: -; GCN-DAG: buffer_store_dword v +; GCN-DAG: buffer_store_dwordx3 {{v\[}}[[LOW_RESULT_REG]]: define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind { %vecins = insertelement <3 x i32> %a, i32 5, i32 %b store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16 Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll @@ -193,6 +193,22 @@ ret void } +;CHECK-LABEL: 
{{^}}buffer_load_x3_offen_merged: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +;CHECK: s_waitcnt +define amdgpu_ps void @buffer_load_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a) { +main_body: + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 12 + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + %r1 = extractelement <2 x float> %vr1, i32 0 + %r2 = extractelement <2 x float> %vr1, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true) + ret void +} + ;CHECK-LABEL: {{^}}buffer_load_x1_offset_merged: ;CHECK-NEXT: %bb. ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 @@ -227,6 +243,117 @@ ret void } +;CHECK-LABEL: {{^}}buffer_load_x3_offset_merged: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +;CHECK: s_waitcnt +define amdgpu_ps void @buffer_load_x3_offset_merged(<4 x i32> inreg %rsrc) { +main_body: + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) + %r1 = extractelement <2 x float> %vr1, i32 0 + %r2 = extractelement <2 x float> %vr1, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x1_idxen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_load_dwordx2 v[0:1], v0, s[0:3], 0 idxen offset:4 +define amdgpu_ps void @buffer_load_x1_idxen_merged(<4 x i32> inreg %rsrc, i32 %index) { + %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 4, i1 0, i1 0) + %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + call 
void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float 0.0, float 0.0, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x1_idxen_merged2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen offset:4 +define amdgpu_ps void @buffer_load_x1_idxen_merged2(<4 x i32> inreg %rsrc, i32 %index) { + %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 4, i1 0, i1 0) + %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 12, i1 0, i1 0) + %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 16, i1 0, i1 0) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x2_idxen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_load_x2_idxen_merged(<4 x i32> inreg %rsrc, i32 %index) { + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + %vr2 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + %r1 = extractelement <2 x float> %vr1, i32 0 + %r2 = extractelement <2 x float> %vr1, i32 1 + %r3 = extractelement <2 x float> %vr2, i32 0 + %r4 = extractelement <2 x float> %vr2, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x3_idxen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_load_dwordx3 v[0:2], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_load_x3_idxen_merged(<4 x i32> inreg %rsrc, i32 %index) { + %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 4, 
i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x3_idxen_merged2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_load_dwordx3 v[0:2], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_load_x3_idxen_merged2(<4 x i32> inreg %rsrc, i32 %index) { + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + %r1 = extractelement <2 x float> %vr1, i32 0 + %r2 = extractelement <2 x float> %vr1, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x3_idxen_merged3: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_load_dwordx3 v[0:2], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_load_x3_idxen_merged3(<4 x i32> inreg %rsrc, i32 %index) { + %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %index, i32 12, i1 0, i1 0) + %r2 = extractelement <2 x float> %vr1, i32 0 + %r3 = extractelement <2 x float> %vr1, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x4_idxen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_load_x4_idxen_merged(<4 x i32> inreg %rsrc, i32 %index) { + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> 
%rsrc, i32 %index, i32 12, i1 0, i1 0) + %r1 = extractelement <2 x float> %vr1, i32 0 + %r2 = extractelement <2 x float> %vr1, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x4_idxen_merged2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen offset:8 +define amdgpu_ps void @buffer_load_x4_idxen_merged2(<4 x i32> inreg %rsrc, i32 %index) { + %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %index, i32 12, i1 0, i1 0) + %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 20, i1 0, i1 0) + %r2 = extractelement <2 x float> %vr1, i32 0 + %r3 = extractelement <2 x float> %vr1, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + ret void +} + declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0 Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll @@ -147,6 +147,41 @@ ret void } +;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 +define amdgpu_ps void @buffer_store_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3) { + %a1 = add i32 %a, 28 + %a2 = add i32 %a, 32 + %a3 = add i32 %a, 36 + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float 
%v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +define amdgpu_ps void @buffer_store_x3_offen_merged2(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, float %v2) { + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 12 + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged3: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +define amdgpu_ps void @buffer_store_x3_offen_merged3(<4 x i32> inreg %rsrc, i32 %a, float %v1, <2 x float> %v2) { + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 8 + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + ret void +} + ;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged: ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 @@ -164,12 +199,118 @@ ;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) { +define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1, <2 x float> %v2) { call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) ret 
void } +;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +define amdgpu_ps void @buffer_store_x3_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged2: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +define amdgpu_ps void @buffer_store_x3_offset_merged2(<4 x i32> inreg %rsrc, float %v1, <2 x float> %v2) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged3: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:8 +define amdgpu_ps void @buffer_store_x3_offset_merged3(<4 x i32> inreg %rsrc, <2 x float> %v1, float %v2) { + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) + ret void +} + + +;CHECK-LABEL: {{^}}buffer_store_x1_idxen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 idxen offset:4 +define amdgpu_ps void @buffer_store_x1_idxen_merged(<4 x i32> inreg %rsrc, i32 %index, float %v1, float %v2) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %index, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) 
+ ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1_idxen_merged2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 idxen offset:4 +define amdgpu_ps void @buffer_store_x1_idxen_merged2(<4 x i32> inreg %rsrc, i32 %index, float %v1, float %v2, float %v3, float %v4) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %index, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %index, i32 12, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 %index, i32 16, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2_idxen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x2_idxen_merged(<4 x i32> inreg %rsrc, i32 %index, <2 x float> %v1, <2 x float> %v2) { + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x3_idxen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx3 v[1:3], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x3_idxen_merged(<4 x i32> inreg %rsrc, i32 %index, float %v1, float %v2, float %v3) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %index, i32 12, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %index, i32 16, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x3_idxen_merged2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx3 v[1:3], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x3_idxen_merged2(<4 x i32> inreg %rsrc, i32 %index, 
<2 x float> %v1, float %v2) { + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %index, i32 16, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x3_idxen_merged3: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx3 v[1:3], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x3_idxen_merged3(<4 x i32> inreg %rsrc, i32 %index, float %v1, <2 x float> %v2) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %index, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x4_idxen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x4_idxen_merged(<4 x i32> inreg %rsrc, i32 %index, float %v1, <2 x float> %v2, float %v3) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 %index, i32 12, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %index, i32 20, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x4_idxen_merged2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x4_idxen_merged2(<4 x i32> inreg %rsrc, i32 %index, <2 x float> %v1, float %v2, float %v3) { + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %index, i32 12, i1 0, i1 0) + ret void +} + declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0 
declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0 Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll @@ -0,0 +1,114 @@ +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}s_buffer_load_imm: +;CHECK-NOT: s_waitcnt +;CHECK: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x4 +define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) { +main_body: + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0) + %bitcast = bitcast i32 %load to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_load_index: +;CHECK-NOT: s_waitcnt +;CHECK: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +define amdgpu_ps void @s_buffer_load_index(<4 x i32> inreg %desc, i32 inreg %index) { +main_body: + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %index, i32 0) + %bitcast = bitcast i32 %load to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_loadx2_imm: +;CHECK-NOT: s_waitcnt +;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x40 +define amdgpu_ps void @s_buffer_loadx2_imm(<4 x i32> inreg %desc) { +main_body: + %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 64, i32 0) + %bitcast = bitcast <2 x i32> %load to <2 x float> + %x = extractelement <2 x float> %bitcast, i32 0 + %y = extractelement <2 x float> %bitcast, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float
undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_loadx2_index: +;CHECK-NOT: s_waitcnt +;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +define amdgpu_ps void @s_buffer_loadx2_index(<4 x i32> inreg %desc, i32 inreg %index) { +main_body: + %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %index, i32 0) + %bitcast = bitcast <2 x i32> %load to <2 x float> + %x = extractelement <2 x float> %bitcast, i32 0 + %y = extractelement <2 x float> %bitcast, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_loadx4_imm: +;CHECK-NOT: s_waitcnt +;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0xc8 +define amdgpu_ps void @s_buffer_loadx4_imm(<4 x i32> inreg %desc) { +main_body: + %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 200, i32 0) + %bitcast = bitcast <4 x i32> %load to <4 x float> + %x = extractelement <4 x float> %bitcast, i32 0 + %y = extractelement <4 x float> %bitcast, i32 1 + %z = extractelement <4 x float> %bitcast, i32 2 + %w = extractelement <4 x float> %bitcast, i32 3 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_loadx4_index: +;CHECK-NOT: s_waitcnt +;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +define amdgpu_ps void @s_buffer_loadx4_index(<4 x i32> inreg %desc, i32 inreg %index) { +main_body: + %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %index, i32 0) + %bitcast = bitcast <4 x i32> %load to <4 x float> + %x = extractelement <4 x float> %bitcast, i32 0 + %y = extractelement <4 x float> %bitcast, i32 1 + %z = extractelement <4 x float> %bitcast, i32 2 + %w = extractelement <4 x float> %bitcast, i32 3 + call void @llvm.amdgcn.exp.f32(i32
0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex2: +;CHECK-NOT: s_waitcnt +;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x4 +define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) { +main_body: + %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0) + %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0) + %x = bitcast i32 %load0 to float + %y = bitcast i32 %load1 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex4: +;CHECK-NOT: s_waitcnt +;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x8 +define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) { +main_body: + %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0) + %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 12, i32 0) + %load2 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 16, i32 0) + %load3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 20, i32 0) + %x = bitcast i32 %load0 to float + %y = bitcast i32 %load1 to float + %z = bitcast i32 %load2 to float + %w = bitcast i32 %load3 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) +declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) +declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32) +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) Index: test/CodeGen/AMDGPU/merge-stores.ll =================================================================== --- test/CodeGen/AMDGPU/merge-stores.ll +++ test/CodeGen/AMDGPU/merge-stores.ll @@
-164,8 +164,8 @@ } ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32: -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dword +; SI-DAG: buffer_store_dwordx3 +; SI-NOT: buffer_store_dwordx2 ; SI-NOT: buffer_store_dword ; GCN: s_endpgm define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { @@ -274,11 +274,9 @@ } ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32: -; SI-DAG: buffer_load_dwordx2 -; SI-DAG: buffer_load_dword v +; SI-DAG: buffer_load_dwordx3 ; GCN: s_waitcnt -; SI-DAG: buffer_store_dword v -; SI-DAG: buffer_store_dwordx2 v +; SI-DAG: buffer_store_dwordx3 v ; GCN: s_endpgm define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 @@ -563,8 +561,7 @@ ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32: ; GCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dword v +; GCN: buffer_store_dwordx3 define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 @@ -611,13 +608,11 @@ ; GCN-LABEL: {{^}}copy_v3i32_align4: ; GCN-NOT: SCRATCH_RSRC_DWORD -; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-NOT: offen ; GCN: s_waitcnt vmcnt ; GCN-NOT: offen -; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: ScratchSize: 0{{$}} define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias 
%out, <3 x i32> addrspace(1)* noalias %in) #0 { @@ -644,13 +639,11 @@ ; GCN-LABEL: {{^}}copy_v3f32_align4: ; GCN-NOT: SCRATCH_RSRC_DWORD -; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-NOT: offen ; GCN: s_waitcnt vmcnt ; GCN-NOT: offen -; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: ScratchSize: 0{{$}} define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4 Index: test/CodeGen/AMDGPU/store-global.ll =================================================================== --- test/CodeGen/AMDGPU/store-global.ll +++ test/CodeGen/AMDGPU/store-global.ll @@ -273,8 +273,7 @@ } ; FUNC-LABEL: {{^}}store_v3i32: -; SIVI-DAG: buffer_store_dwordx2 -; SIVI-DAG: buffer_store_dword v +; SIVI-DAG: buffer_store_dwordx3 ; GFX9-DAG: global_store_dwordx2 ; GFX9-DAG: global_store_dword v Index: test/CodeGen/AMDGPU/store-v3i64.ll =================================================================== --- test/CodeGen/AMDGPU/store-v3i64.ll +++ test/CodeGen/AMDGPU/store-v3i64.ll @@ -89,8 +89,7 @@ } ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i32: -; GCN-DAG: buffer_store_dwordx2 -; GCN-DAG: buffer_store_dword v +; GCN-DAG: buffer_store_dwordx3 define amdgpu_kernel void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) { %trunc = trunc <3 x i64> %x to <3 x i32> store <3 x i32> %trunc, <3 x i32> addrspace(1)* %out