diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -23,6 +23,8 @@
 #include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
+#include
+#include

 using namespace llvm;

@@ -36,41 +38,252 @@
                    "their ordering for scheduling"),
           cl::init(false));

-static cl::opt>
-    VMEMGroupMaxSize("amdgpu-igrouplp-vmem-group-size", cl::init(None),
-                     cl::Hidden,
-                     cl::desc("The maximum number of instructions to include "
-                              "in VMEM group."));
-
-static cl::opt>
-    MFMAGroupMaxSize("amdgpu-igrouplp-mfma-group-size", cl::init(None),
-                     cl::Hidden,
-                     cl::desc("The maximum number of instructions to include "
-                              "in MFMA group."));
-
-static cl::opt>
-    LDRGroupMaxSize("amdgpu-igrouplp-ldr-group-size", cl::init(None),
-                    cl::Hidden,
-                    cl::desc("The maximum number of instructions to include "
-                             "in lds/gds read group."));
-
-static cl::opt>
-    LDWGroupMaxSize("amdgpu-igrouplp-ldw-group-size", cl::init(None),
-                    cl::Hidden,
-                    cl::desc("The maximum number of instructions to include "
-                             "in lds/gds write group."));
-
-typedef function_ref
-    CanAddMIFn;
+enum class SchedGroupMask { // Bit flags naming the instruction classes a SchedGroup accepts.
+  NONE = 0u,            // No class selected.
+  ALU = 1u << 0,        // Any VALU, MFMA or SALU instruction.
+  VALU = 1u << 1,       // VALU instructions that are not MFMA.
+  SALU = 1u << 2,       // SALU instructions.
+  MFMA = 1u << 3,       // MFMA instructions.
+  VMEM = 1u << 4,       // VMEM instructions, or FLAT instructions that are not DS.
+  VMEM_READ = 1u << 5,  // Members of VMEM for which mayLoad() is true.
+  VMEM_WRITE = 1u << 6, // Members of VMEM for which mayStore() is true.
+  DS = 1u << 7,         // DS instructions.
+  DS_READ = 1u << 8,    // DS instructions for which mayLoad() is true.
+  DS_WRITE = 1u << 9,   // DS instructions for which mayStore() is true.
+  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS | // Union of every flag above.
+        DS_READ | DS_WRITE,
+  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) // Enables the bitwise operators used on this enum.
+};
+
+// The order of IGroup stages and their optional sizes as returned
+// by the parser. 
+static SmallVector>, 8>
+    IGroupLPOrder;
+
+// Command-line parser for the -amdgpu-igrouplp-order list. It handles one
+// token at a time and keeps state between tokens: the kind of the previous
+// token (PrevToken) and the set of groups named so far (ObservedGroups). A
+// well formed list is a sequence of group names, each optionally followed by
+// a natural-number size.
+struct IGroupOrderParser
+    : public cl::parser>> {
+  IGroupOrderParser(cl::Option &O)
+      : cl::parser>>(O) {}
+
+  // Possible categories that a comma separated string may
+  // fall into
+  enum Token { tok_start, tok_group, tok_number, tok_error };
+
+  // The previously encountered token type
+  unsigned PrevToken = tok_start;
+
+  // A bit vector encoding the encountered groups thus far
+  SchedGroupMask ObservedGroups = SchedGroupMask::NONE;
+
+  // Map a lower-case group name to its mask. SchedGroupMask::NONE is returned
+  // for a name that is not recognised.
+  SchedGroupMask getMaskFromStr(const std::string &Token) {
+    if (Token == "alu")
+      return SchedGroupMask::ALU;
+    if (Token == "valu")
+      return SchedGroupMask::VALU;
+    if (Token == "salu")
+      return SchedGroupMask::SALU;
+    if (Token == "mfma")
+      return SchedGroupMask::MFMA;
+    if (Token == "vmem")
+      return SchedGroupMask::VMEM;
+    if (Token == "vmemr")
+      return SchedGroupMask::VMEM_READ;
+    if (Token == "vmemw")
+      return SchedGroupMask::VMEM_WRITE;
+    if (Token == "ds")
+      return SchedGroupMask::DS;
+    if (Token == "dsr")
+      return SchedGroupMask::DS_READ;
+    if (Token == "dsw")
+      return SchedGroupMask::DS_WRITE;
+
+    return SchedGroupMask::NONE;
+  }
+
+  // Ensure that we do not have multiple occurrences of the same
+  // igroup (including potential SubGroups).
+  // Returns tok_group and records TokenMask (plus its sub groups) in
+  // ObservedGroups on success; reports through O.error and returns tok_error
+  // when the group, or one of its sub groups, was already named.
+  unsigned handleGroup(std::string &Token, cl::Option &O,
+                       SchedGroupMask TokenMask) {
+    assert(TokenMask != SchedGroupMask::NONE &&
+           TokenMask != SchedGroupMask::ALL);
+    if ((ObservedGroups & TokenMask) != SchedGroupMask::NONE) {
+      O.error("Multiple occurance of " + Token);
+      return tok_error;
+    }
+
+    // Composite groups also claim the finer grained groups they contain.
+    SchedGroupMask SubGroupMask = SchedGroupMask::NONE;
+    if (TokenMask == SchedGroupMask::ALU)
+      SubGroupMask =
+          SchedGroupMask::SALU | SchedGroupMask::VALU | SchedGroupMask::MFMA;
+    else if (TokenMask == SchedGroupMask::DS)
+      SubGroupMask = SchedGroupMask::DS_READ | SchedGroupMask::DS_WRITE;
+    else if (TokenMask == SchedGroupMask::VMEM)
+      SubGroupMask = SchedGroupMask::VMEM_READ | SchedGroupMask::VMEM_WRITE;
+
+    // An empty SubGroupMask can never intersect ObservedGroups, so a single
+    // test covers both the composite and the simple case.
+    if ((ObservedGroups & SubGroupMask) != SchedGroupMask::NONE) {
+      O.error("Multiple occurance " + Token +
+              ". Overlaps with existing SubGroup");
+      return tok_error;
+    }
+
+    // Add group token to encountered groups
+    ObservedGroups |= TokenMask;
+    // Add sub group token to encountered groups
+    ObservedGroups |= SubGroupMask;
+
+    return tok_group;
+  }
+
+  // Check for properly formatted numbers and igroup strings.
+  // If we are unable to easily find one, then flag as error.
+  // TokenMask is written only when Value starts with a letter; it is set to
+  // SchedGroupMask::NONE when the name is unknown.
+  unsigned getTokenType(StringRef Value, cl::Option &O,
+                        SchedGroupMask &TokenMask) {
+    std::string Token = Value.str();
+
+    // An empty token is neither a size nor a group name. Reject it before
+    // looking at its first character.
+    if (Token.empty())
+      return tok_error;
+
+    // The <cctype> classifiers are only defined for values representable as
+    // unsigned char, so never hand them a plain (possibly negative) char.
+    auto IsDigit = [](char C) {
+      return std::isdigit(static_cast<unsigned char>(C)) != 0;
+    };
+
+    // Check for a complete natural number. Decimals and
+    // negatives don't make sense in the context of group size,
+    // and are thus not supported
+    if (IsDigit(Token.front()))
+      return std::all_of(Token.begin(), Token.end(), IsDigit) ? tok_number
+                                                              : tok_error;
+
+    if (std::isalpha(static_cast<unsigned char>(Token.front()))) {
+      // Transform the string to lower case to allow for
+      // more matching
+      std::transform(Token.begin(), Token.end(), Token.begin(),
+                     [](unsigned char c) { return std::tolower(c); });
+
+      // Check if the token matches with a supported IGroup
+      TokenMask = getMaskFromStr(Token);
+      if (TokenMask != SchedGroupMask::NONE)
+        return handleGroup(Token, O, TokenMask);
+    }
+    // Bad alphabetical string, or non alpha/numeric string
+    return tok_error;
+  }
+
+  // Parse one token of the list. Returns true (after reporting through
+  // O.error) when the token is malformed or out of place, false on success.
+  // A size token is stored into the most recent entry of IGroupLPOrder.
+  // NOTE(review): Value.first is left untouched for a size token; presumably
+  // cl::list still appends Value after a successful parse - confirm that the
+  // resulting extra entries are harmless.
+  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg,
+             std::pair> &Value) {
+    unsigned CurrToken = getTokenType(Arg, O, Value.first);
+
+    if (CurrToken == tok_error)
+      return O.error("Invalid Token '" + Arg + "'");
+
+    switch (PrevToken) {
+    case tok_start:
+    case tok_number:
+      // If there has been no token, or if the previous token was a group size,
+      // then we must encounter a group name.
+      if (CurrToken != tok_group)
+        return O.error("Invalid Token '" + Arg + "'. Expected group token.");
+      break;
+    case tok_group:
+      if (CurrToken == tok_number) {
+        // getAsInteger signals overflow through its return value, whereas
+        // std::stoi would throw for a size that does not fit.
+        unsigned GroupSize = 0;
+        if (Arg.getAsInteger(10, GroupSize))
+          return O.error("Invalid Token '" + Arg +
+                         "'. Group size is out of range.");
+        IGroupLPOrder.back().second = GroupSize;
+        break;
+      }
+      // If we previously encountered a group name, and the current token is not
+      // a number, then the current token must be a group name
+      if (CurrToken != tok_group)
+        return O.error("Invalid Token '" + Arg +
+                       "'. Expected group or number token.");
+      break;
+    case tok_error:
+    default:
+      // The only other possible token value is tok_error which is already
+      // handled.
+      llvm_unreachable("Unsupported Token occured");
+    }
+
+    PrevToken = CurrToken;
+    return false;
+  }
+};
+
+static cl::list>, 8>,
+                IGroupOrderParser>
+    List("amdgpu-igrouplp-order",
+         cl::desc("This option is used to specify the order of groups and "
+                  "their sizes to be used in AMDGPUIGroupLP. 
To specify, "
+                  "enter a comma separated list of groups in {alu, salu, valu, "
+                  "mfma, vmem, vmemr, vmemw, ds, dsr, dsw} and an optional "
+                  "size after each."),
+         cl::CommaSeparated, cl::location(IGroupLPOrder));

 // Classify instructions into groups to enable fine tuned control over the
 // scheduler. These groups may be more specific than current SchedModel
 // instruction classes.
 class SchedGroup {
 private:
-  // Function that returns true if a non-bundle MI may be inserted into this
-  // group.
-  const CanAddMIFn canAddMI;
+  // Mask that defines which instruction types can be classified into this
+  // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
+  // and SCHED_GROUP_BARRIER.
+  SchedGroupMask SGMask;
+
+  // Use SGMask to determine whether we can classify MI as a member of this
+  // SchedGroup object. Meta instructions are never members. TII is supplied
+  // by the caller because not every constructor sets the TII member.
+  bool canAddMI(const MachineInstr &MI, const SIInstrInfo *TII) const {
+    assert(TII && "SchedGroup needs a SIInstrInfo to classify instructions");
+    bool Result = false;
+    if (MI.isMetaInstruction())
+      Result = false;
+
+    else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
+             (TII->isVALU(MI) || TII->isMFMA(MI) || TII->isSALU(MI)))
+      Result = true;
+
+    else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
+             TII->isVALU(MI) && !TII->isMFMA(MI))
+      Result = true;
+
+    else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
+             TII->isSALU(MI))
+      Result = true;
+
+    else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
+             TII->isMFMA(MI))
+      Result = true;
+
+    else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
+             (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+      Result = true;
+
+    else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
+             MI.mayLoad() &&
+             (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+      Result = true;
+
+    else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
+             MI.mayStore() &&
+             (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+      Result = true;
+
+    else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
+             TII->isDS(MI))
+      Result = true;
+
+    else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
+             MI.mayLoad() && TII->isDS(MI))
+      Result = true;
+
+    else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
+             MI.mayStore() && TII->isDS(MI))
+      Result = true;
+
+    LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
+                      << format_hex((int)SGMask, 10, true)
+                      << (Result ? " added " : " unable to add ") << MI);
+
+    return Result;
+  }

   // Maximum number of SUnits that can be added to this group.
   Optional MaxSize;
@@ -78,7 +291,9 @@
   // Collection of SUnits that are classified as members of this group.
   SmallVector Collection;

-  ScheduleDAGInstrs *DAG;
+  ScheduleDAGInstrs *DAG = nullptr;
+
+  // Only set by the four-argument constructor; stays null otherwise so that a
+  // missing value is detectable instead of being an indeterminate pointer.
+  const SIInstrInfo *TII = nullptr;

   void tryAddEdge(SUnit *A, SUnit *B) {
     if (A != B && DAG->canAddEdge(B, A)) {
@@ -124,7 +339,7 @@
   }

   // Returns true if no more instructions may be added to this group.
-  bool isFull() { return MaxSize.hasValue() && Collection.size() >= *MaxSize; }
+  bool isFull() { return MaxSize && Collection.size() >= *MaxSize; }

   // Returns true if SU can be added to this SchedGroup.
   bool canAddSU(SUnit &SU, const SIInstrInfo *TII) {
@@ -132,9 +347,9 @@
       return false;

     MachineInstr &MI = *SU.getInstr();
-    if (MI.getOpcode() != TargetOpcode::BUNDLE)
-      return canAddMI(MI, TII);
-
+    // Prefer the caller-supplied TII and fall back to the member: groups
+    // created without a TII must still be able to classify instructions.
+    const SIInstrInfo *InstrInfo = TII ? TII : this->TII;
+    if (MI.getOpcode() != TargetOpcode::BUNDLE)
+      return canAddMI(MI, InstrInfo);
+
     // Special case for bundled MIs.
     const MachineBasicBlock *MBB = MI.getParent();
     MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
@@ -142,50 +357,21 @@
       ++E;

     // Return true if all of the bundled MIs can be added to this group.
-    return std::all_of(
-        B, E, [this, TII](MachineInstr &MI) { return canAddMI(MI, TII); });
+    return std::all_of(B, E, [this, InstrInfo](MachineInstr &MI) {
+      return canAddMI(MI, InstrInfo);
+    });
   }

   void add(SUnit &SU) { Collection.push_back(&SU); }

-  SchedGroup(CanAddMIFn canAddMI, Optional MaxSize,
-             ScheduleDAGInstrs *DAG)
-      : canAddMI(canAddMI), MaxSize(MaxSize), DAG(DAG) {}
-};
-
-bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
-  return TII->isMFMA(MI);
-}
-
-bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
-  return TII->isVALU(MI) && !TII->isMFMA(MI);
-}
-
-bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
-  return TII->isSALU(MI);
-}
-
-bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
-  return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
-}
-
-bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
-  return MI.mayLoad() &&
-         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
-}
-
-bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
-  return MI.mayStore() &&
-         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
-}
+  // Group with a mask only: no size limit, no DAG and no TII.
+  SchedGroup(SchedGroupMask SGMask) : SGMask(SGMask) {}

-bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
-  return MI.mayStore() && TII->isDS(MI);
-}
+  // Group without its own TII; classification relies on the TII handed to
+  // canAddSU.
+  SchedGroup(SchedGroupMask SGMask, Optional MaxSize,
+             ScheduleDAGInstrs *DAG)
+      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG) {}

-bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
-  return MI.mayLoad() && TII->isDS(MI);
-}
+  SchedGroup(SchedGroupMask SGMask, Optional MaxSize,
+             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {}
+};

 class IGroupLPDAGMutation : public ScheduleDAGMutation {
 public:
@@ -206,23 +392,6 @@

   ScheduleDAGMI *DAG;

-  // Components of the mask that determines which instructions may not be
-  // scheduled across the SCHED_BARRIER. 
- enum class SchedBarrierMasks { - NONE = 0u, - ALU = 1u << 0, - VALU = 1u << 1, - SALU = 1u << 2, - MFMA = 1u << 3, - VMEM = 1u << 4, - VMEM_READ = 1u << 5, - VMEM_WRITE = 1u << 6, - DS = 1u << 7, - DS_READ = 1u << 8, - DS_WRITE = 1u << 9, - LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ DS_WRITE) - }; - // Cache SchedGroups of each type if we have multiple SCHED_BARRIERs in a // region. // @@ -268,11 +437,26 @@ // order in which edges will be added. In other words, given the // present ordering, we will try to make each VMEMRead instruction // a predecessor of each DSRead instruction, and so on. - SmallVector PipelineOrderGroups = { - SchedGroup(isVMEMSGMember, VMEMGroupMaxSize, DAG), - SchedGroup(isDSReadSGMember, LDRGroupMaxSize, DAG), - SchedGroup(isMFMASGMember, MFMAGroupMaxSize, DAG), - SchedGroup(isDSWriteSGMember, LDWGroupMaxSize, DAG)}; + SmallVector PipelineOrderGroups; + + // Since the input string has been pre-parsed, we know we have a + // well formed sequence of well formed strings. They will start with + // an IGroup and will optinally be followed by a size. + if (IGroupLPOrder.size() > 0) { + for (auto &Stage : IGroupLPOrder) { + PipelineOrderGroups.push_back( + SchedGroup(Stage.first, Stage.second, DAG, TII)); + } + } + + // Default to backwardsly compatible behavior + else { + PipelineOrderGroups = { + SchedGroup(SchedGroupMask::VMEM, None, DAG, TII), + SchedGroup(SchedGroupMask::DS_READ, None, DAG, TII), + SchedGroup(SchedGroupMask::MFMA, None, DAG, TII), + SchedGroup(SchedGroupMask::DS_WRITE, None, DAG, TII)}; + } for (SUnit &SU : DAG->SUnits) { LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU)); @@ -324,78 +508,81 @@ void SchedBarrierDAGMutation::getSchedGroupsFromMask( int32_t Mask, SmallVectorImpl &SchedGroups) { - SchedBarrierMasks SBMask = (SchedBarrierMasks)Mask; + SchedGroupMask SBMask = (SchedGroupMask)Mask; // See IntrinsicsAMDGPU.td for an explanation of these masks and their // mappings. 
// - if ((SBMask & SchedBarrierMasks::VALU) == SchedBarrierMasks::NONE && - (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) { + if ((SBMask & SchedGroupMask::VALU) == SchedGroupMask::NONE && + (SBMask & SchedGroupMask::ALU) == SchedGroupMask::NONE) { if (!VALUSchedGroup) { - VALUSchedGroup = std::make_unique(isVALUSGMember, None, DAG); + VALUSchedGroup = + std::make_unique(SchedGroupMask::VALU, None, DAG); initSchedGroup(VALUSchedGroup.get()); } SchedGroups.push_back(VALUSchedGroup.get()); } - if ((SBMask & SchedBarrierMasks::SALU) == SchedBarrierMasks::NONE && - (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) { + if ((SBMask & SchedGroupMask::SALU) == SchedGroupMask::NONE && + (SBMask & SchedGroupMask::ALU) == SchedGroupMask::NONE) { if (!SALUSchedGroup) { - SALUSchedGroup = std::make_unique(isSALUSGMember, None, DAG); + SALUSchedGroup = + std::make_unique(SchedGroupMask::SALU, None, DAG); initSchedGroup(SALUSchedGroup.get()); } SchedGroups.push_back(SALUSchedGroup.get()); } - if ((SBMask & SchedBarrierMasks::MFMA) == SchedBarrierMasks::NONE && - (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) { + if ((SBMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE && + (SBMask & SchedGroupMask::ALU) == SchedGroupMask::NONE) { if (!MFMASchedGroup) { - MFMASchedGroup = std::make_unique(isMFMASGMember, None, DAG); + MFMASchedGroup = + std::make_unique(SchedGroupMask::MFMA, None, DAG); initSchedGroup(MFMASchedGroup.get()); } SchedGroups.push_back(MFMASchedGroup.get()); } - if ((SBMask & SchedBarrierMasks::VMEM_READ) == SchedBarrierMasks::NONE && - (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) { + if ((SBMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE && + (SBMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE) { if (!VMEMReadSchedGroup) { VMEMReadSchedGroup = - std::make_unique(isVMEMReadSGMember, None, DAG); + std::make_unique(SchedGroupMask::VMEM_READ, None, DAG); initSchedGroup(VMEMReadSchedGroup.get()); 
} SchedGroups.push_back(VMEMReadSchedGroup.get()); } - if ((SBMask & SchedBarrierMasks::VMEM_WRITE) == SchedBarrierMasks::NONE && - (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) { + if ((SBMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE && + (SBMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE) { if (!VMEMWriteSchedGroup) { VMEMWriteSchedGroup = - std::make_unique(isVMEMWriteSGMember, None, DAG); + std::make_unique(SchedGroupMask::VMEM_WRITE, None, DAG); initSchedGroup(VMEMWriteSchedGroup.get()); } SchedGroups.push_back(VMEMWriteSchedGroup.get()); } - if ((SBMask & SchedBarrierMasks::DS_READ) == SchedBarrierMasks::NONE && - (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) { + if ((SBMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE && + (SBMask & SchedGroupMask::DS) == SchedGroupMask::NONE) { if (!DSReadSchedGroup) { DSReadSchedGroup = - std::make_unique(isDSReadSGMember, None, DAG); + std::make_unique(SchedGroupMask::DS_READ, None, DAG); initSchedGroup(DSReadSchedGroup.get()); } SchedGroups.push_back(DSReadSchedGroup.get()); } - if ((SBMask & SchedBarrierMasks::DS_WRITE) == SchedBarrierMasks::NONE && - (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) { + if ((SBMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE && + (SBMask & SchedGroupMask::DS) == SchedGroupMask::NONE) { if (!DSWriteSchedGroup) { DSWriteSchedGroup = - std::make_unique(isDSWriteSGMember, None, DAG); + std::make_unique(SchedGroupMask::DS_WRITE, None, DAG); initSchedGroup(DSWriteSchedGroup.get()); } diff --git a/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir b/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir --- a/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir +++ b/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir @@ -1,14 +1,17 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - 2>&1 | 
FileCheck -check-prefix=DEFAULT %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-igrouplp=1 2>&1 | FileCheck -check-prefix=PIPELINE %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-igrouplp=1 2>&1 | FileCheck -check-prefix=DEFAULTPIPE %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-igrouplp=1 -amdgpu-igrouplp-order=vmem,8,mfma,8 2>&1 | FileCheck -check-prefix=PARTIALPIPE %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-igrouplp=1 -amdgpu-igrouplp-order=valu,salu,dsr,vmemr,mfma,20,vmemw,dsw 2>&1 | FileCheck -check-prefix=COMPLETEPIPE %s + --- -name: no_pipeline +name: no_default_pipeline tracksRegLiveness: true body: | bb.0: liveins: $sgpr0, $vgpr10_vgpr11 - ; DEFAULT-LABEL: name: no_pipeline + ; DEFAULT-LABEL: name: no_default_pipeline ; DEFAULT: liveins: $sgpr0, $vgpr10_vgpr11 ; DEFAULT-NEXT: {{ $}} ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec @@ -21,19 +24,45 @@ ; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec ; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec ; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec - ; PIPELINE-LABEL: name: no_pipeline - ; PIPELINE: liveins: $sgpr0, $vgpr10_vgpr11 - ; PIPELINE-NEXT: {{ $}} - ; PIPELINE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - ; PIPELINE-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec - ; PIPELINE-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec - ; 
PIPELINE-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec - ; PIPELINE-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec - ; PIPELINE-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + ; DEFAULTPIPE-LABEL: name: no_default_pipeline + ; DEFAULTPIPE: liveins: $sgpr0, $vgpr10_vgpr11 + ; DEFAULTPIPE-NEXT: {{ $}} + ; DEFAULTPIPE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; DEFAULTPIPE-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + ; PARTIALPIPE-LABEL: name: no_default_pipeline + ; PARTIALPIPE: liveins: $sgpr0, $vgpr10_vgpr11 + ; PARTIALPIPE-NEXT: {{ $}} + ; PARTIALPIPE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; PARTIALPIPE-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + ; 
COMPLETEPIPE-LABEL: name: no_default_pipeline + ; COMPLETEPIPE: liveins: $sgpr0, $vgpr10_vgpr11 + ; COMPLETEPIPE-NEXT: {{ $}} + ; COMPLETEPIPE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, killed $vgpr1, 0, 0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec $vgpr1 = V_MOV_B32_e32 1, implicit $exec $vgpr0 = V_MOV_B32_e32 1, implicit $exec $vgpr8 = V_MOV_B32_e32 0, implicit $exec @@ -48,12 +77,12 @@ --- -name: full_pipe +name: full_pipeline tracksRegLiveness: true body: | bb.0: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $sgpr0, $vgpr10_vgpr11 - ; DEFAULT-LABEL: name: full_pipe + ; DEFAULT-LABEL: name: full_pipeline ; DEFAULT: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11 ; DEFAULT-NEXT: {{ $}} ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec @@ -97,52 +126,144 @@ ; DEFAULT-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec ; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr23, killed $vgpr3, 0, 16, implicit $m0, implicit $exec ; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, 
implicit $m0, implicit $exec - ; PIPELINE-LABEL: name: full_pipe - ; PIPELINE: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11 - ; PIPELINE-NEXT: {{ $}} - ; PIPELINE-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; PIPELINE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec - ; PIPELINE-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec - ; PIPELINE-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - ; PIPELINE-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec - ; PIPELINE-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec - ; PIPELINE-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec - ; PIPELINE-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec - ; PIPELINE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; PIPELINE-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; PIPELINE-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec - ; PIPELINE-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec - ; PIPELINE-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, 
implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec { - ; PIPELINE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec - ; PIPELINE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - ; PIPELINE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec - ; PIPELINE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec - ; PIPELINE-NEXT: $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - ; PIPELINE-NEXT: } - ; PIPELINE-NEXT: DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec - ; PIPELINE-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec { - ; PIPELINE-NEXT: $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec - ; PIPELINE-NEXT: $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec - ; PIPELINE-NEXT: } - ; PIPELINE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, 
implicit $mode, implicit $exec - ; PIPELINE-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr7, implicit $m0, implicit $exec, implicit killed $vgpr23, implicit killed $vgpr3 { - ; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec - ; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr23, killed $vgpr3, 0, 16, implicit $m0, implicit $exec - ; PIPELINE-NEXT: } - ; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec + ; DEFAULTPIPE-LABEL: name: full_pipeline + ; DEFAULTPIPE: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11 + ; DEFAULTPIPE-NEXT: {{ $}} + ; DEFAULTPIPE-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec + ; 
DEFAULTPIPE-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec { + ; DEFAULTPIPE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; DEFAULTPIPE-NEXT: } + ; DEFAULTPIPE-NEXT: DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec + ; DEFAULTPIPE-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec { + ; DEFAULTPIPE-NEXT: $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec + ; DEFAULTPIPE-NEXT: } + ; DEFAULTPIPE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULTPIPE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULTPIPE-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, killed 
$vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULTPIPE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULTPIPE-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULTPIPE-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULTPIPE-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr7, implicit $m0, implicit $exec, implicit killed $vgpr23, implicit killed $vgpr3 { + ; DEFAULTPIPE-NEXT: DS_WRITE_B32 killed $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec + ; DEFAULTPIPE-NEXT: DS_WRITE_B32 killed $vgpr23, killed $vgpr3, 0, 16, implicit $m0, implicit $exec + ; DEFAULTPIPE-NEXT: } + ; DEFAULTPIPE-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec + ; PARTIALPIPE-LABEL: name: full_pipeline + ; PARTIALPIPE: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11 + ; PARTIALPIPE-NEXT: {{ $}} + ; PARTIALPIPE-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit 
$exec + ; PARTIALPIPE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec { + ; PARTIALPIPE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; PARTIALPIPE-NEXT: } + ; PARTIALPIPE-NEXT: DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec + ; PARTIALPIPE-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec { + ; 
PARTIALPIPE-NEXT: $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec + ; PARTIALPIPE-NEXT: } + ; PARTIALPIPE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; PARTIALPIPE-NEXT: DS_WRITE_B32 $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec + ; PARTIALPIPE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; PARTIALPIPE-NEXT: DS_WRITE_B32 killed $vgpr23, $vgpr3, 0, 16, implicit $m0, implicit $exec + ; PARTIALPIPE-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + ; PARTIALPIPE-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec + ; PARTIALPIPE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; PARTIALPIPE-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec + ; PARTIALPIPE-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; COMPLETEPIPE-LABEL: name: full_pipeline + ; COMPLETEPIPE: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11 + ; COMPLETEPIPE-NEXT: {{ $}} + ; COMPLETEPIPE-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec 
+ ; COMPLETEPIPE-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: BUNDLE implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit $vgpr0_vgpr1, implicit $exec, implicit $vgpr2_vgpr3, implicit $vgpr4_vgpr5 { + ; COMPLETEPIPE-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec + ; COMPLETEPIPE-NEXT: } + ; COMPLETEPIPE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec + ; COMPLETEPIPE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, 
implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec { + ; COMPLETEPIPE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; COMPLETEPIPE-NEXT: } + ; COMPLETEPIPE-NEXT: DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec + ; COMPLETEPIPE-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec { + ; COMPLETEPIPE-NEXT: $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec + ; COMPLETEPIPE-NEXT: } + ; COMPLETEPIPE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; COMPLETEPIPE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; COMPLETEPIPE-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + ; COMPLETEPIPE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; COMPLETEPIPE-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec + ; COMPLETEPIPE-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, $vgpr0, killed 
$agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; COMPLETEPIPE-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr7, implicit $m0, implicit $exec, implicit killed $vgpr23, implicit killed $vgpr3, implicit killed $vgpr9, implicit killed $vgpr24 { + ; COMPLETEPIPE-NEXT: DS_WRITE_B32 killed $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec + ; COMPLETEPIPE-NEXT: DS_WRITE_B32 killed $vgpr23, killed $vgpr3, 0, 16, implicit $m0, implicit $exec + ; COMPLETEPIPE-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec + ; COMPLETEPIPE-NEXT: } $vgpr0 = V_MOV_B32_e32 0, implicit $exec $vgpr1 = V_MOV_B32_e32 1, implicit $exec $vgpr2 = V_MOV_B32_e32 2, implicit $exec @@ -199,17 +320,39 @@ ; DEFAULT-NEXT: $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec ; DEFAULT-NEXT: } ; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-LABEL: name: block_ends_in_bundle - ; PIPELINE: liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3 - ; PIPELINE-NEXT: {{ $}} - ; PIPELINE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit killed $vgpr7, implicit $exec { - ; PIPELINE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec - ; PIPELINE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - ; PIPELINE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec - ; PIPELINE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec - ; PIPELINE-NEXT: $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, 
implicit $exec - ; PIPELINE-NEXT: } - ; PIPELINE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULTPIPE-LABEL: name: block_ends_in_bundle + ; DEFAULTPIPE: liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3 + ; DEFAULTPIPE-NEXT: {{ $}} + ; DEFAULTPIPE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit killed $vgpr7, implicit $exec { + ; DEFAULTPIPE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec + ; DEFAULTPIPE-NEXT: } + ; DEFAULTPIPE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; PARTIALPIPE-LABEL: name: block_ends_in_bundle + ; PARTIALPIPE: liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3 + ; PARTIALPIPE-NEXT: {{ $}} + ; PARTIALPIPE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit 
killed $vgpr7, implicit $exec { + ; PARTIALPIPE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec + ; PARTIALPIPE-NEXT: } + ; PARTIALPIPE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; COMPLETEPIPE-LABEL: name: block_ends_in_bundle + ; COMPLETEPIPE: liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3 + ; COMPLETEPIPE-NEXT: {{ $}} + ; COMPLETEPIPE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit killed $vgpr7, implicit $exec { + ; COMPLETEPIPE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec + ; COMPLETEPIPE-NEXT: } + ; COMPLETEPIPE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit 
$exec BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec { $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec