Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -48,6 +48,7 @@
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -74,23 +75,181 @@
 #define DEBUG_TYPE "si-load-store-opt"
 
 namespace {
+enum InstClassEnum {
+  UNKNOWN,
+  DS_READ,
+  DS_WRITE,
+  S_LOAD_IMM,
+  S_BUFFER_LOAD_IMM,
+  BUFFER_LOAD_IDXEN,
+  BUFFER_LOAD_OFFEN,
+  BUFFER_LOAD_OFFSET,
+  BUFFER_STORE_IDXEN,
+  BUFFER_STORE_OFFEN,
+  BUFFER_STORE_OFFSET,
+  BUFFER_LOAD_IDXEN_exact,
+  BUFFER_LOAD_OFFEN_exact,
+  BUFFER_LOAD_OFFSET_exact,
+  BUFFER_STORE_IDXEN_exact,
+  BUFFER_STORE_OFFEN_exact,
+  BUFFER_STORE_OFFSET_exact,
+};
 
-class SILoadStoreOptimizer : public MachineFunctionPass {
-  enum InstClassEnum {
-    DS_READ_WRITE,
-    S_BUFFER_LOAD_IMM,
-    BUFFER_LOAD_OFFEN,
-    BUFFER_LOAD_OFFSET,
-    BUFFER_STORE_OFFEN,
-    BUFFER_STORE_OFFSET,
-  };
+enum RegisterEnum {
+  SBASE = 0x1,
+  SRSRC = 0x2,
+  SOFFSET = 0x4,
+  VADDR = 0x8,
+  ADDR = 0x10,
+};
+
+struct OpcodeInfo {
+  InstClassEnum InstClass = UNKNOWN;
+  unsigned Width = 0;
+  unsigned Regs = 0;
+};
+
+const DenseMap<unsigned, OpcodeInfo> OpcodeInfoMap{
+    {AMDGPU::S_LOAD_DWORD_IMM, {S_LOAD_IMM, 1, SBASE}},
+    {AMDGPU::S_LOAD_DWORDX2_IMM, {S_LOAD_IMM, 2, SBASE}},
+    {AMDGPU::S_LOAD_DWORDX4_IMM, {S_LOAD_IMM, 4, SBASE}},
+    {AMDGPU::S_LOAD_DWORDX8_IMM, {S_LOAD_IMM, 8, SBASE}},
+    {AMDGPU::S_LOAD_DWORDX16_IMM, {S_LOAD_IMM, 16, SBASE}},
+
+    {AMDGPU::S_BUFFER_LOAD_DWORD_IMM, {S_BUFFER_LOAD_IMM, 1, SBASE}},
+    {AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM, {S_BUFFER_LOAD_IMM, 2, SBASE}},
+    {AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM, {S_BUFFER_LOAD_IMM, 4, SBASE}},
+    {AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM, {S_BUFFER_LOAD_IMM, 8, SBASE}},
+    {AMDGPU::S_BUFFER_LOAD_DWORDX16_IMM, {S_BUFFER_LOAD_IMM, 16, SBASE}},
+
+    {AMDGPU::BUFFER_LOAD_DWORD_IDXEN,
+     {BUFFER_LOAD_IDXEN, 1, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_LOAD_DWORDX2_IDXEN,
+     {BUFFER_LOAD_IDXEN, 2, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_LOAD_DWORDX3_IDXEN,
+     {BUFFER_LOAD_IDXEN, 3, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_LOAD_DWORDX4_IDXEN,
+     {BUFFER_LOAD_IDXEN, 4, SRSRC | SOFFSET | VADDR}},
+
+    {AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
+     {BUFFER_LOAD_OFFEN, 1, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN,
+     {BUFFER_LOAD_OFFEN, 2, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN,
+     {BUFFER_LOAD_OFFEN, 3, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN,
+     {BUFFER_LOAD_OFFEN, 4, SRSRC | SOFFSET | VADDR}},
+
+    {AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
+     {BUFFER_LOAD_OFFSET, 1, SRSRC | SOFFSET}},
+    {AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET,
+     {BUFFER_LOAD_OFFSET, 2, SRSRC | SOFFSET}},
+    {AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET,
+     {BUFFER_LOAD_OFFSET, 3, SRSRC | SOFFSET}},
+    {AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET,
+     {BUFFER_LOAD_OFFSET, 4, SRSRC | SOFFSET}},
+
+    {AMDGPU::BUFFER_STORE_DWORD_IDXEN,
+     {BUFFER_STORE_IDXEN, 1, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_STORE_DWORDX2_IDXEN,
+     {BUFFER_STORE_IDXEN, 2, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_STORE_DWORDX3_IDXEN,
+     {BUFFER_STORE_IDXEN, 3, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_STORE_DWORDX4_IDXEN,
+     {BUFFER_STORE_IDXEN, 4, SRSRC | SOFFSET | VADDR}},
+
+    {AMDGPU::BUFFER_STORE_DWORD_OFFEN,
+     {BUFFER_STORE_OFFEN, 1, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_STORE_DWORDX2_OFFEN,
+     {BUFFER_STORE_OFFEN, 2, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_STORE_DWORDX3_OFFEN,
+     {BUFFER_STORE_OFFEN, 3, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_STORE_DWORDX4_OFFEN,
+     {BUFFER_STORE_OFFEN, 4, SRSRC | SOFFSET | VADDR}},
+
+    {AMDGPU::BUFFER_STORE_DWORD_OFFSET,
+     {BUFFER_STORE_OFFSET, 1, SRSRC | SOFFSET}},
+    {AMDGPU::BUFFER_STORE_DWORDX2_OFFSET,
+     {BUFFER_STORE_OFFSET, 2, SRSRC | SOFFSET}},
+    {AMDGPU::BUFFER_STORE_DWORDX3_OFFSET,
+     {BUFFER_STORE_OFFSET, 3, SRSRC | SOFFSET}},
+    {AMDGPU::BUFFER_STORE_DWORDX4_OFFSET,
+     {BUFFER_STORE_OFFSET, 4, SRSRC | SOFFSET}},
+
+    {AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact,
+     {BUFFER_LOAD_IDXEN_exact, 1, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_LOAD_DWORDX2_IDXEN_exact,
+     {BUFFER_LOAD_IDXEN_exact, 2, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_LOAD_DWORDX3_IDXEN_exact,
+     {BUFFER_LOAD_IDXEN_exact, 3, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_LOAD_DWORDX4_IDXEN_exact,
+     {BUFFER_LOAD_IDXEN_exact, 4, SRSRC | SOFFSET | VADDR}},
+
+    {AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
+     {BUFFER_LOAD_OFFEN_exact, 1, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN_exact,
+     {BUFFER_LOAD_OFFEN_exact, 2, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN_exact,
+     {BUFFER_LOAD_OFFEN_exact, 3, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN_exact,
+     {BUFFER_LOAD_OFFEN_exact, 4, SRSRC | SOFFSET | VADDR}},
+
+    {AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
+     {BUFFER_LOAD_OFFSET_exact, 1, SRSRC | SOFFSET}},
+    {AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET_exact,
+     {BUFFER_LOAD_OFFSET_exact, 2, SRSRC | SOFFSET}},
+    {AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET_exact,
+     {BUFFER_LOAD_OFFSET_exact, 3, SRSRC | SOFFSET}},
+    {AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET_exact,
+     {BUFFER_LOAD_OFFSET_exact, 4, SRSRC | SOFFSET}},
+
+    {AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact,
+     {BUFFER_STORE_IDXEN_exact, 1, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_STORE_DWORDX2_IDXEN_exact,
+     {BUFFER_STORE_IDXEN_exact, 2, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_STORE_DWORDX3_IDXEN_exact,
+     {BUFFER_STORE_IDXEN_exact, 3, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_STORE_DWORDX4_IDXEN_exact,
+     {BUFFER_STORE_IDXEN_exact, 4, SRSRC | SOFFSET | VADDR}},
+
+    {AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
+     {BUFFER_STORE_OFFEN_exact, 1, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact,
+     {BUFFER_STORE_OFFEN_exact, 2, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_STORE_DWORDX3_OFFEN_exact,
+     {BUFFER_STORE_OFFEN_exact, 3, SRSRC | SOFFSET | VADDR}},
+    {AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact,
+     {BUFFER_STORE_OFFEN_exact, 4, SRSRC | SOFFSET | VADDR}},
+
+    {AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
+     {BUFFER_STORE_OFFSET_exact, 1, SRSRC | SOFFSET}},
+    {AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact,
+     {BUFFER_STORE_OFFSET_exact, 2, SRSRC | SOFFSET}},
+    {AMDGPU::BUFFER_STORE_DWORDX3_OFFSET_exact,
+     {BUFFER_STORE_OFFSET_exact, 3, SRSRC | SOFFSET}},
+    {AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact,
+     {BUFFER_STORE_OFFSET_exact, 4, SRSRC | SOFFSET}},
+
+    {AMDGPU::DS_READ_B32, {DS_READ, 0, ADDR}},
+    {AMDGPU::DS_READ_B64, {DS_READ, 0, ADDR}},
+    {AMDGPU::DS_READ_B32_gfx9, {DS_READ, 0, ADDR}},
+    {AMDGPU::DS_READ_B64_gfx9, {DS_READ, 0, ADDR}},
+
+    {AMDGPU::DS_WRITE_B32, {DS_WRITE, 0, ADDR}},
+    {AMDGPU::DS_WRITE_B64, {DS_WRITE, 0, ADDR}},
+    {AMDGPU::DS_WRITE_B32_gfx9, {DS_WRITE, 0, ADDR}},
+    {AMDGPU::DS_WRITE_B64_gfx9, {DS_WRITE, 0, ADDR}},
+};
+
+class SILoadStoreOptimizer : public MachineFunctionPass {
 
   struct CombineInfo {
     MachineBasicBlock::iterator I;
     MachineBasicBlock::iterator Paired;
     unsigned EltSize;
     unsigned Offset0;
     unsigned Offset1;
+    unsigned Width0;
+    unsigned Width1;
     unsigned BaseOff;
     InstClassEnum InstClass;
     bool GLC0;
@@ -98,7 +257,6 @@
     bool SLC0;
     bool SLC1;
     bool UseST64;
-    bool IsX2;
     SmallVector<MachineInstr *, 8> InstsToMove;
   };
 
@@ -108,9 +266,13 @@
   const SIRegisterInfo *TRI = nullptr;
   MachineRegisterInfo *MRI = nullptr;
   AliasAnalysis *AA = nullptr;
-  unsigned CreatedX2;
+  bool OptimizeAgain;
 
   static bool offsetsCanBeCombined(CombineInfo &CI);
+  static bool widthsFit(const CombineInfo &CI);
+  static unsigned getNewOpcode(const CombineInfo &CI);
+  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
+  static const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
 
   bool findMatchingInst(CombineInfo &CI);
 
@@ -123,8 +285,6 @@
   MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
-  unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
-                                    bool &IsOffen) const;
   MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
 
 public:
@@ -260,10 +420,9 @@
   CI.BaseOff = 0;
 
   // Handle SMEM and VMEM instructions.
-  if (CI.InstClass != DS_READ_WRITE) {
-    unsigned Diff = CI.IsX2 ? 2 : 1;
-    return (EltOffset0 + Diff == EltOffset1 ||
-            EltOffset1 + Diff == EltOffset0) &&
+  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
+    return (EltOffset0 + CI.Width0 == EltOffset1 ||
+            EltOffset1 + CI.Width1 == EltOffset0) &&
            CI.GLC0 == CI.GLC1 &&
            (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
   }
@@ -305,34 +464,64 @@
   return false;
 }
 
+bool SILoadStoreOptimizer::widthsFit(const CombineInfo &CI) {
+  const unsigned Width = (CI.Width0 + CI.Width1);
+  switch (CI.InstClass) {
+  default:
+    return Width <= 4;
+  case S_BUFFER_LOAD_IMM:
+    switch (Width) {
+    default:
+      return false;
+    case 2:
+    case 4:
+    case 8:
+    case 16:
+      return true;
+    }
+  }
+}
+
+static unsigned getOpcodeWidth(unsigned Opc) {
+  return OpcodeInfoMap.lookup(Opc).Width;
+}
+
 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
   MachineBasicBlock::iterator E = MBB->end();
   MachineBasicBlock::iterator MBBI = CI.I;
 
-  unsigned AddrOpName[3] = {0};
-  int AddrIdx[3];
-  const MachineOperand *AddrReg[3];
+  const unsigned Opc = CI.I->getOpcode();
+
+  const auto lookup = OpcodeInfoMap.lookup(Opc);
+
+  if (lookup.InstClass == UNKNOWN) {
+    return false;
+  }
+
+  unsigned AddrOpName[5] = {0};
+  int AddrIdx[5];
+  const MachineOperand *AddrReg[5];
   unsigned NumAddresses = 0;
 
-  switch (CI.InstClass) {
-  case DS_READ_WRITE:
+  if (lookup.Regs & ADDR) {
     AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
-    break;
-  case S_BUFFER_LOAD_IMM:
+  }
+
+  if (lookup.Regs & SBASE) {
     AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
-    break;
-  case BUFFER_LOAD_OFFEN:
-  case BUFFER_STORE_OFFEN:
-    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
-    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
-    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
-    break;
-  case BUFFER_LOAD_OFFSET:
-  case BUFFER_STORE_OFFSET:
+  }
+
+  if (lookup.Regs & SRSRC) {
     AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
+  }
+
+  if (lookup.Regs & SOFFSET) {
     AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
-    break;
+  }
+
+  if (lookup.Regs & VADDR) {
+    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
   }
 
   for (unsigned i = 0; i < NumAddresses; i++) {
@@ -354,7 +543,13 @@
   addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
 
   for ( ; MBBI != E; ++MBBI) {
-    if (MBBI->getOpcode() != CI.I->getOpcode()) {
+    const auto MBBILookup = OpcodeInfoMap.lookup(MBBI->getOpcode());
+
+    const bool IsDS =
+        (lookup.InstClass == DS_READ) || (lookup.InstClass == DS_WRITE);
+
+    if ((MBBILookup.InstClass != lookup.InstClass) ||
+        (IsDS && (MBBI->getOpcode() != Opc))) {
       // This is not a matching DS instruction, but we can keep looking as
       // long as one of these conditions are met:
       // 1. It is safe to move I down past MBBI.
@@ -426,10 +621,12 @@
       int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
                                                  AMDGPU::OpName::offset);
       CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
+      CI.Width0 = getOpcodeWidth(Opc);
       CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
+      CI.Width1 = getOpcodeWidth(MBBI->getOpcode());
       CI.Paired = MBBI;
 
-      if (CI.InstClass == DS_READ_WRITE) {
+      if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
        CI.Offset0 &= 0xffff;
        CI.Offset1 &= 0xffff;
      } else {
@@ -445,7 +642,7 @@
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
-      if (offsetsCanBeCombined(CI))
+      if (widthsFit(CI) && offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
          return true;
    }
@@ -642,11 +839,10 @@
                                          CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
-  unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
-                              AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+  const unsigned Opcode = getNewOpcode(CI);
+
+  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
 
-  const TargetRegisterClass *SuperRC =
-      CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
 
@@ -656,12 +852,9 @@
       .addImm(CI.GLC0) // glc
       .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
 
-  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
-  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
-
-  // Handle descending offsets
-  if (CI.Offset0 > CI.Offset1)
-    std::swap(SubRegIdx0, SubRegIdx1);
+  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
 
   // Copy to the old destination registers.
   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
@@ -687,25 +880,21 @@
                                          CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
-  unsigned Opcode;
-
-  if (CI.InstClass == BUFFER_LOAD_OFFEN) {
-    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
-                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
-  } else {
-    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
-                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
-  }
+  const unsigned Opcode = getNewOpcode(CI);
 
-  const TargetRegisterClass *SuperRC =
-      CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
+  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
+
+  // Create the destination register for the merged load.
   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
 
   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
 
-  if (CI.InstClass == BUFFER_LOAD_OFFEN)
-    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+  const auto lookup = OpcodeInfoMap.lookup(Opcode);
+
+  if (lookup.Regs & VADDR)
+    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
 
   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -715,12 +904,9 @@
       .addImm(0) // tfe
       .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
 
-  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
-  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
-
-  // Handle descending offsets
-  if (CI.Offset0 > CI.Offset1)
-    std::swap(SubRegIdx0, SubRegIdx1);
+  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
 
   // Copy to the old destination registers.
   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
@@ -742,57 +928,174 @@
   return Next;
 }
 
-unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
-    const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
-  IsX2 = false;
-  IsOffen = false;
-
-  switch (I.getOpcode()) {
-  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
-    IsOffen = true;
-    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
-  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
-    IsOffen = true;
-    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
-  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
-    IsX2 = true;
-    IsOffen = true;
-    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
-  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
-    IsX2 = true;
-    IsOffen = true;
-    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
-  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
-    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
-  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
-    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
-  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
-    IsX2 = true;
-    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
-  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
-    IsX2 = true;
-    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
+unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
+  const unsigned Width = CI.Width0 + CI.Width1;
+
+  for (auto Row : OpcodeInfoMap) {
+    const OpcodeInfo &info = Row.second;
+    if ((info.InstClass == CI.InstClass) && (info.Width == Width)) {
+      return Row.first;
+    }
   }
+  return 0;
 }
 
+std::pair<unsigned, unsigned> SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
+  if (CI.Offset0 > CI.Offset1) {
+    switch (CI.Width0) {
+    default:
+      break;
+    case 1:
+      switch (CI.Width1) {
+      default:
+        break;
+      case 1:
+        return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
+      case 2:
+        return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
+      case 3:
+        return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
+      }
+      break;
+    case 2:
+      switch (CI.Width1) {
+      default:
+        break;
+      case 1:
+        return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
+      case 2:
+        return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
+      }
+      break;
+    case 3:
+      switch (CI.Width1) {
+      default:
+        break;
+      case 1:
+        return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
+      }
+      break;
+    case 4:
+      switch (CI.Width1) {
+      default:
+        break;
+      case 4:
+        return std::make_pair(AMDGPU::sub4_sub5_sub6_sub7,
+                              AMDGPU::sub0_sub1_sub2_sub3);
+      }
+      break;
+    case 8:
+      switch (CI.Width1) {
+      default:
+        break;
+      case 8:
+        return std::make_pair(
+            AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15,
+            AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7);
+      }
+      break;
+    }
+  } else {
+    switch (CI.Width0) {
+    default:
+      break;
+    case 1:
+      switch (CI.Width1) {
+      default:
+        break;
+      case 1:
+        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
+      case 2:
+        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
+      case 3:
+        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
+      }
+      break;
+    case 2:
+      switch (CI.Width1) {
+      default:
+        break;
+      case 1:
+        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
+      case 2:
+        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
+      }
+      break;
+    case 3:
+      switch (CI.Width1) {
+      default:
+        break;
+      case 1:
+        return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
+      }
+      break;
+    case 4:
+      switch (CI.Width1) {
+      default:
+        break;
+      case 4:
+        return std::make_pair(AMDGPU::sub0_sub1_sub2_sub3,
+                              AMDGPU::sub4_sub5_sub6_sub7);
+      }
+      break;
+    case 8:
+      switch (CI.Width1) {
+      default:
+        break;
+      case 8:
+        return std::make_pair(
+            AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
+            AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15);
+      }
+      break;
+    }
+  }
+
+  return std::make_pair(0, 0);
+}
+
+const TargetRegisterClass *SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
+  if ((CI.InstClass == S_LOAD_IMM) || (CI.InstClass == S_BUFFER_LOAD_IMM)) {
+    switch (CI.Width0 + CI.Width1) {
+    default:
+      return nullptr;
+    case 2:
+      return &AMDGPU::SReg_64_XEXECRegClass;
+    case 4:
+      return &AMDGPU::SReg_128RegClass;
+    case 8:
+      return &AMDGPU::SReg_256RegClass;
+    case 16:
+      return &AMDGPU::SReg_512RegClass;
+    }
+  } else {
+    switch (CI.Width0 + CI.Width1) {
+    default:
+      return nullptr;
+    case 2:
+      return &AMDGPU::VReg_64RegClass;
+    case 3:
+      return &AMDGPU::VReg_96RegClass;
+    case 4:
+      return &AMDGPU::VReg_128RegClass;
+    }
+  }
+}
+
 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
     CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
-  bool Unused1, Unused2;
-  unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);
-
-  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
-  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
+
+  const unsigned Opcode = getNewOpcode(CI);
 
-  // Handle descending offsets
-  if (CI.Offset0 > CI.Offset1)
-    std::swap(SubRegIdx0, SubRegIdx1);
+  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
 
   // Copy to the new source register.
-  const TargetRegisterClass *SuperRC =
-      CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
+  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
   unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
 
   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
@@ -807,7 +1110,9 @@
   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
                  .addReg(SrcReg, RegState::Kill);
-  if (CI.InstClass == BUFFER_STORE_OFFEN)
+  const auto lookup = OpcodeInfoMap.lookup(Opcode);
+
+  if (lookup.Regs & VADDR)
     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
 
   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -841,90 +1146,73 @@
       continue;
     }
 
+    const unsigned Opc = MI.getOpcode();
+    const auto lookup = OpcodeInfoMap.lookup(Opc);
+
     CombineInfo CI;
     CI.I = I;
-    unsigned Opc = MI.getOpcode();
-    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
-        Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
+    CI.InstClass = lookup.InstClass;
 
-      CI.InstClass = DS_READ_WRITE;
+    switch (lookup.InstClass) {
+    default:
+      break;
+    case DS_READ:
       CI.EltSize =
-        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
-
+          (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                           : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }
-
      continue;
-    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
-               Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
-               Opc == AMDGPU::DS_WRITE_B64_gfx9) {
-      CI.InstClass = DS_READ_WRITE;
+    case DS_WRITE:
      CI.EltSize =
          (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
-
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }
-
      continue;
-    }
-    if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
-        Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
-      // EltSize is in units of the offset encoding.
-      CI.InstClass = S_BUFFER_LOAD_IMM;
+    case S_BUFFER_LOAD_IMM:
      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
-      CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeSBufferLoadImmPair(CI);
-        if (!CI.IsX2)
-          CreatedX2++;
+        OptimizeAgain |= (CI.Width0 + CI.Width1) <= 16;
      } else {
        ++I;
      }
      continue;
-    }
-    if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
-        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
-        Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
-        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
-      if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
-          Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
-        CI.InstClass = BUFFER_LOAD_OFFEN;
-      else
-        CI.InstClass = BUFFER_LOAD_OFFSET;
-
+    case BUFFER_LOAD_IDXEN:
+    case BUFFER_LOAD_OFFEN:
+    case BUFFER_LOAD_OFFSET:
+    case BUFFER_LOAD_IDXEN_exact:
+    case BUFFER_LOAD_OFFEN_exact:
+    case BUFFER_LOAD_OFFSET_exact:
      CI.EltSize = 4;
-      CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
-                Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferLoadPair(CI);
-        if (!CI.IsX2)
-          CreatedX2++;
+        OptimizeAgain |= (CI.Width0 + CI.Width1) <= 4;
      } else {
        ++I;
      }
      continue;
-    }
-
-    bool StoreIsX2, IsOffen;
-    if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
-      CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
+    case BUFFER_STORE_IDXEN:
+    case BUFFER_STORE_OFFEN:
+    case BUFFER_STORE_OFFSET:
+    case BUFFER_STORE_IDXEN_exact:
+    case BUFFER_STORE_OFFEN_exact:
+    case BUFFER_STORE_OFFSET_exact:
      CI.EltSize = 4;
-      CI.IsX2 = StoreIsX2;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferStorePair(CI);
-        if (!CI.IsX2)
-          CreatedX2++;
+        OptimizeAgain |= (CI.Width0 + CI.Width1) <= 4;
      } else {
        ++I;
      }
@@ -958,12 +1246,10 @@
   bool Modified = false;
 
   for (MachineBasicBlock &MBB : MF) {
-    CreatedX2 = 0;
-    Modified |= optimizeBlock(MBB);
-
-    // Run again to convert x2 to x4.
-    if (CreatedX2 >= 1)
+    do {
+      OptimizeAgain = false;
       Modified |= optimizeBlock(MBB);
+    } while (OptimizeAgain);
   }
 
   return Modified;
Index: test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
===================================================================
--- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -36,10 +36,10 @@
 ; GCN-LABEL: {{^}}load_v3i8_to_v3f32:
 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
 ; GCN-NOT: v_cvt_f32_ubyte3_e32
-; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]]
-; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
+; GCN-DAG: v_cvt_f32_ubyte2_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
+; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[VAL]]
 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+; GCN: buffer_store_dwordx3 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
Index: test/CodeGen/AMDGPU/early-if-convert-cost.ll
===================================================================
--- test/CodeGen/AMDGPU/early-if-convert-cost.ll
+++ test/CodeGen/AMDGPU/early-if-convert-cost.ll
@@ -60,8 +60,7 @@
 
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
-; GCN-DAG: buffer_store_dword v
-; GCN-DAG: buffer_store_dwordx2
+; GCN-DAG: buffer_store_dwordx3
 define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 {
 entry:
   %v = load <3 x i32>, <3 x i32> addrspace(1)* %in
Index: test/CodeGen/AMDGPU/insert_vector_elt.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -95,8 +95,7 @@
 ; GCN-LABEL: {{^}}dynamic_insertelement_v3f32:
 ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
-; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
-; GCN-DAG: buffer_store_dword v
+; GCN-DAG: buffer_store_dwordx3 {{v\[}}[[LOW_RESULT_REG]]:
 define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
   store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
@@ -146,8 +145,7 @@
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i32:
 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5
-; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
-; GCN-DAG: buffer_store_dword v
+; GCN-DAG: buffer_store_dwordx3 {{v\[}}[[LOW_RESULT_REG]]:
 define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32>
addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind { %vecins = insertelement <3 x i32> %a, i32 5, i32 %b store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16 Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll @@ -193,6 +193,22 @@ ret void } +;CHECK-LABEL: {{^}}buffer_load_x3_offen_merged: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +;CHECK: s_waitcnt +define amdgpu_ps void @buffer_load_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a) { +main_body: + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 12 + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + %r1 = extractelement <2 x float> %vr1, i32 0 + %r2 = extractelement <2 x float> %vr1, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true) + ret void +} + ;CHECK-LABEL: {{^}}buffer_load_x1_offset_merged: ;CHECK-NEXT: %bb. ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 @@ -227,6 +243,117 @@ ret void } +;CHECK-LABEL: {{^}}buffer_load_x3_offset_merged: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +;CHECK: s_waitcnt +define amdgpu_ps void @buffer_load_x3_offset_merged(<4 x i32> inreg %rsrc) { +main_body: + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) + %r1 = extractelement <2 x float> %vr1, i32 0 + %r2 = extractelement <2 x float> %vr1, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x1_idxen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_load_dwordx2 v[0:1], v0, s[0:3], 0 idxen offset:4 +define amdgpu_ps void @buffer_load_x1_idxen_merged(<4 x i32> inreg %rsrc, i32 %index) { + %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 4, i1 0, i1 0) + %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float 0.0, float 0.0, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x1_idxen_merged2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen offset:4 +define amdgpu_ps void @buffer_load_x1_idxen_merged2(<4 x i32> inreg %rsrc, i32 %index) { + %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 4, i1 0, i1 0) + %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 12, i1 0, i1 0) + %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 16, i1 0, i1 0) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x2_idxen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_load_x2_idxen_merged(<4 x i32> inreg %rsrc, i32 %index) { + %vr1 = call <2 x float> 
@llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + %vr2 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + %r1 = extractelement <2 x float> %vr1, i32 0 + %r2 = extractelement <2 x float> %vr1, i32 1 + %r3 = extractelement <2 x float> %vr2, i32 0 + %r4 = extractelement <2 x float> %vr2, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x3_idxen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_load_dwordx3 v[0:2], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_load_x3_idxen_merged(<4 x i32> inreg %rsrc, i32 %index) { + %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 4, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x3_idxen_merged2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_load_dwordx3 v[0:2], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_load_x3_idxen_merged2(<4 x i32> inreg %rsrc, i32 %index) { + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + %r1 = extractelement <2 x float> %vr1, i32 0 + %r2 = extractelement <2 x float> %vr1, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x3_idxen_merged3: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_load_dwordx3 v[0:2], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_load_x3_idxen_merged3(<4 x i32> inreg %rsrc, i32 %index) { + %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %index, i32 12, i1 0, i1 0) + %r2 = extractelement <2 x float> %vr1, i32 0 + %r3 = extractelement <2 x float> %vr1, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x4_idxen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_load_x4_idxen_merged(<4 x i32> inreg %rsrc, i32 %index) { + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 12, i1 0, i1 0) + %r1 = extractelement <2 x float> %vr1, i32 0 + %r2 = extractelement <2 x float> %vr1, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x4_idxen_merged2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen offset:8 +define amdgpu_ps void @buffer_load_x4_idxen_merged2(<4 x i32> inreg %rsrc, i32 %index) { + %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, 
i32 %index, i32 12, i1 0, i1 0) + %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %index, i32 20, i1 0, i1 0) + %r2 = extractelement <2 x float> %vr1, i32 0 + %r3 = extractelement <2 x float> %vr1, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + ret void +} + declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0 Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll @@ -147,6 +147,41 @@ ret void } +;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 +define amdgpu_ps void @buffer_store_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3) { + %a1 = add i32 %a, 28 + %a2 = add i32 %a, 32 + %a3 = add i32 %a, 36 + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +define amdgpu_ps void @buffer_store_x3_offen_merged2(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, float %v2) { + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 12 + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged3: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +define amdgpu_ps void @buffer_store_x3_offen_merged3(<4 x i32> inreg %rsrc, i32 %a, float %v1, <2 x float> %v2) { + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 8 + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + ret void +} + ;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged: ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 @@ -164,12 +199,118 @@ ;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) { +define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1, <2 x float> %v2) { call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) ret void } +;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +define amdgpu_ps void 
@buffer_store_x3_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged2: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +define amdgpu_ps void @buffer_store_x3_offset_merged2(<4 x i32> inreg %rsrc, float %v1, <2 x float> %v2) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged3: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:8 +define amdgpu_ps void @buffer_store_x3_offset_merged3(<4 x i32> inreg %rsrc, <2 x float> %v1, float %v2) { + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) + ret void +} + + +;CHECK-LABEL: {{^}}buffer_store_x1_idxen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 idxen offset:4 +define amdgpu_ps void @buffer_store_x1_idxen_merged(<4 x i32> inreg %rsrc, i32 %index, float %v1, float %v2) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %index, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1_idxen_merged2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 idxen offset:4 +define amdgpu_ps void @buffer_store_x1_idxen_merged2(<4 x i32> inreg %rsrc, i32 %index, float %v1, float %v2, float %v3, float %v4) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %index, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %index, i32 12, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 %index, i32 16, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2_idxen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x2_idxen_merged(<4 x i32> inreg %rsrc, i32 %index, <2 x float> %v1, <2 x float> %v2) { + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x3_idxen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx3 v[1:3], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x3_idxen_merged(<4 x i32> inreg %rsrc, i32 %index, float %v1, float %v2, float %v3) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %index, i32 12, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %index, 
i32 16, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x3_idxen_merged2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx3 v[1:3], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x3_idxen_merged2(<4 x i32> inreg %rsrc, i32 %index, <2 x float> %v1, float %v2) { + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %index, i32 16, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x3_idxen_merged3: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx3 v[1:3], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x3_idxen_merged3(<4 x i32> inreg %rsrc, i32 %index, float %v1, <2 x float> %v2) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %index, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x4_idxen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x4_idxen_merged(<4 x i32> inreg %rsrc, i32 %index, float %v1, <2 x float> %v2, float %v3) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 %index, i32 12, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %index, i32 20, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x4_idxen_merged2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x4_idxen_merged2(<4 x i32> inreg %rsrc, i32 %index, <2 x float> %v1, float %v2, float %v3) { + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %index, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %index, i32 12, i1 0, i1 0) + ret void +} + declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0 declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0 Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll @@ -0,0 +1,242 @@ +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}s_buffer_load_imm: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x4 +define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) { +main_body: + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0) + %bitcast = bitcast i32 %load to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_load_index: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +define amdgpu_ps void @s_buffer_load_index(<4 x i32> inreg %desc, i32 inreg %index) { +main_body: + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %index, i32 0) + %bitcast = bitcast i32 %load to 
float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_loadx2_imm: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x40 +define amdgpu_ps void @s_buffer_loadx2_imm(<4 x i32> inreg %desc) { +main_body: + %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 64, i32 0) + %bitcast = bitcast <2 x i32> %load to <2 x float> + %x = extractelement <2 x float> %bitcast, i32 0 + %y = extractelement <2 x float> %bitcast, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_loadx2_index: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +define amdgpu_ps void @s_buffer_loadx2_index(<4 x i32> inreg %desc, i32 inreg %index) { +main_body: + %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %index, i32 0) + %bitcast = bitcast <2 x i32> %load to <2 x float> + %x = extractelement <2 x float> %bitcast, i32 0 + %y = extractelement <2 x float> %bitcast, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_loadx4_imm: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0xc8 +define amdgpu_ps void @s_buffer_loadx4_imm(<4 x i32> inreg %desc) { +main_body: + %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 200, i32 0) + %bitcast = bitcast <4 x i32> %load to <4 x float> + %x = extractelement <4 x float> %bitcast, i32 0 + %y = extractelement <4 x float> %bitcast, i32 1 + %z = extractelement <4 x float> %bitcast, i32 2 + %w = extractelement <4 x float> %bitcast, i32 3 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_loadx4_index: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +define amdgpu_ps void @s_buffer_loadx4_index(<4 x i32> inreg %desc, i32 inreg %index) { +main_body: + %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %index, i32 0) + %bitcast = bitcast <4 x i32> %load to <4 x float> + %x = extractelement <4 x float> %bitcast, i32 0 + %y = extractelement <4 x float> %bitcast, i32 1 + %z = extractelement <4 x float> %bitcast, i32 2 + %w = extractelement <4 x float> %bitcast, i32 3 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_loadx8_imm: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x8 +define amdgpu_ps void @s_buffer_loadx8_imm(<4 x i32> inreg %desc) { +main_body: + %load = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %desc, i32 8, i32 0) + %bitcast = bitcast <8 x i32> %load to <8 x float> + %x = extractelement <8 x float> %bitcast, i32 0 + %y = extractelement <8 x float> %bitcast, i32 2 + %z = extractelement <8 x float> %bitcast, i32 4 + %w = extractelement <8 x float> %bitcast, i32 6 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_loadx8_index: +;CHECK-NOT: s_waitcnt; +;CHECK: 
s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +define amdgpu_ps void @s_buffer_loadx8_index(<4 x i32> inreg %desc, i32 inreg %index) { +main_body: + %load = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %desc, i32 %index, i32 0) + %bitcast = bitcast <8 x i32> %load to <8 x float> + %x = extractelement <8 x float> %bitcast, i32 0 + %y = extractelement <8 x float> %bitcast, i32 2 + %z = extractelement <8 x float> %bitcast, i32 4 + %w = extractelement <8 x float> %bitcast, i32 6 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_loadx16_imm: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0xc +define amdgpu_ps void @s_buffer_loadx16_imm(<4 x i32> inreg %desc) { +main_body: + %load = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %desc, i32 12, i32 0) + %bitcast = bitcast <16 x i32> %load to <16 x float> + %x = extractelement <16 x float> %bitcast, i32 0 + %y = extractelement <16 x float> %bitcast, i32 4 + %z = extractelement <16 x float> %bitcast, i32 8 + %w = extractelement <16 x float> %bitcast, i32 12 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_loadx16_index: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +define amdgpu_ps void @s_buffer_loadx16_index(<4 x i32> inreg %desc, i32 inreg %index) { +main_body: + %load = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %desc, i32 %index, i32 0) + %bitcast = bitcast <16 x i32> %load to <16 x float> + %x = extractelement <16 x float> %bitcast, i32 0 + %y = extractelement <16 x float> %bitcast, i32 4 + %z = extractelement <16 x float> %bitcast, i32 8 + %w = extractelement <16 x float> %bitcast, i32 12 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex2: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x4 +define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) { +main_body: + %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0) + %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0) + %x = bitcast i32 %load0 to float + %y = bitcast i32 %load1 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex4: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x8 +define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) { +main_body: + %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0) + %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 12, i32 0) + %load2 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 16, i32 0) + %load3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 20, i32 0) + %x = bitcast i32 %load0 to float + %y = bitcast i32 %load1 to float + %z = bitcast i32 %load2 to float + %w = bitcast i32 %load3 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: 
{{^}}s_buffer_load_imm_mergex8: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x0 +define amdgpu_ps void @s_buffer_load_imm_mergex8(<4 x i32> inreg %desc) { +main_body: + %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0) + %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0) + %load2 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0) + %load3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 12, i32 0) + %load4 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 16, i32 0) + %load5 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 20, i32 0) + %load6 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 24, i32 0) + %load7 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 28, i32 0) + %add0 = add i32 %load0, %load1 + %add1 = add i32 %load2, %load3 + %add2 = add i32 %load4, %load5 + %add3 = add i32 %load6, %load7 + %x = bitcast i32 %add0 to float + %y = bitcast i32 %add1 to float + %z = bitcast i32 %add2 to float + %w = bitcast i32 %add3 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex16: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x0 +define amdgpu_ps void @s_buffer_load_imm_mergex16(<4 x i32> inreg %desc) { +main_body: + %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0) + %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0) + %load2 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0) + %load3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 12, i32 0) + %load4 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 16, i32 0) + %load5 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 20, i32 0) + %load6 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 24, i32 0) + %load7 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 28, i32 0) + %load8 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 32, i32 0) + %load9 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 36, i32 0) + %loada = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 40, i32 0) + %loadb = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 44, i32 0) + %loadc = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 48, i32 0) + %loadd = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 52, i32 0) + %loade = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 56, i32 0) + %loadf = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 60, i32 0) + %add0 = add i32 %load0, %load1 + %add1 = add i32 %load2, %load3 + %add2 = add i32 %load4, %load5 + %add3 = add i32 %load6, %load7 + %add4 = add i32 %load8, %load9 + %add5 = add i32 %loada, %loadb + %add6 = add i32 %loadc, %loadd + %add7 = add i32 %loade, %loadf + %mul0 = mul i32 %add0, %add1 + %mul1 = mul i32 %add2, %add3 + %mul2 = mul i32 %add4, %add5 + %mul3 = mul i32 %add6, %add7 + %x = bitcast i32 %mul0 to float + %y = bitcast i32 %mul1 to float + %z = bitcast i32 %mul2 to float + %w = bitcast i32 %mul3 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, 
float, i1, i1) +declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) +declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32) +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) +declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32) +declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32) Index: test/CodeGen/AMDGPU/merge-stores.ll =================================================================== --- test/CodeGen/AMDGPU/merge-stores.ll +++ test/CodeGen/AMDGPU/merge-stores.ll @@ -164,8 +164,8 @@ } ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32: -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dword +; SI-DAG: buffer_store_dwordx3 +; SI-NOT: buffer_store_dwordx2 ; SI-NOT: buffer_store_dword ; GCN: s_endpgm define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { @@ -274,11 +274,9 @@ } ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32: -; SI-DAG: buffer_load_dwordx2 -; SI-DAG: buffer_load_dword v +; SI-DAG: buffer_load_dwordx3 ; GCN: s_waitcnt -; SI-DAG: buffer_store_dword v -; SI-DAG: buffer_store_dwordx2 v +; SI-DAG: buffer_store_dwordx3 v ; GCN: s_endpgm define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 @@ -563,8 +561,7 @@ ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32: ; GCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dword v +; GCN: buffer_store_dwordx3 define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 @@ -611,13 +608,11 @@ ; GCN-LABEL: {{^}}copy_v3i32_align4: ; GCN-NOT: SCRATCH_RSRC_DWORD -; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-NOT: offen ; GCN: s_waitcnt vmcnt ; GCN-NOT: offen -; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: ScratchSize: 0{{$}} define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 { @@ -644,13 +639,11 @@ ; GCN-LABEL: {{^}}copy_v3f32_align4: ; GCN-NOT: SCRATCH_RSRC_DWORD -; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-NOT: offen ; GCN: s_waitcnt vmcnt ; GCN-NOT: offen -; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: ScratchSize: 0{{$}} define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4 Index: 
test/CodeGen/AMDGPU/store-global.ll =================================================================== --- test/CodeGen/AMDGPU/store-global.ll +++ test/CodeGen/AMDGPU/store-global.ll @@ -273,8 +273,7 @@ } ; FUNC-LABEL: {{^}}store_v3i32: -; SIVI-DAG: buffer_store_dwordx2 -; SIVI-DAG: buffer_store_dword v +; SIVI-DAG: buffer_store_dwordx3 ; GFX9-DAG: global_store_dwordx2 ; GFX9-DAG: global_store_dword v Index: test/CodeGen/AMDGPU/store-v3i64.ll =================================================================== --- test/CodeGen/AMDGPU/store-v3i64.ll +++ test/CodeGen/AMDGPU/store-v3i64.ll @@ -89,8 +89,7 @@ } ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i32: -; GCN-DAG: buffer_store_dwordx2 -; GCN-DAG: buffer_store_dword v +; GCN-DAG: buffer_store_dwordx3 define amdgpu_kernel void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) { %trunc = trunc <3 x i64> %x to <3 x i32> store <3 x i32> %trunc, <3 x i32> addrspace(1)* %out
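
Illustrative note, not part of the patch: the new merging keys off the summed widths in OpcodeInfoMap, so a dword access and a dwordx2 access at adjacent offsets can now fold into a single dwordx3, which is exactly what the updated tests check for. A hedged before/after sketch at the assembly level, with placeholder registers and offsets:

; before merging: widths 1 and 2 at adjacent offsets
;   buffer_store_dword   v1, off, s[0:3], 0 offset:4
;   buffer_store_dwordx2 v[2:3], off, s[0:3], 0 offset:8
; after SILoadStoreOptimizer (1 + 2 = 3 dwords):
;   buffer_store_dwordx3 v[1:3], off, s[0:3], 0 offset:4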