diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -23,6 +23,8 @@ #include "llvm/ADT/BitmaskEnum.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/TargetOpcodes.h" +#include +#include using namespace llvm; @@ -36,6 +38,219 @@ "their ordering for scheduling"), cl::init(false)); +typedef function_ref + CanAddMIFn; + +// IGroupTableEntry groups the data describing one IGroup: its name, its +// IGroupClass id, its classification (CanAddMIFn) function, its SubGroups and +// an optional MaxSize. To support a new IGroup, define its classification +// function, add an entry to the IGroupClass enum and add it to the IGroupTable. +struct IGroupTableEntry { + std::string GroupName; + unsigned IGP; + CanAddMIFn canAddMI; + SmallVector SubGroups; + Optional MaxSize; + + IGroupTableEntry(std::string GroupName, unsigned IGP, CanAddMIFn canAddMI) + : GroupName(GroupName), IGP(IGP), canAddMI(canAddMI) {} + + IGroupTableEntry(std::string GroupName, unsigned IGP, CanAddMIFn canAddMI, + SmallVector SubGroups) + : GroupName(GroupName), IGP(IGP), canAddMI(canAddMI), + SubGroups(SubGroups) {} +}; + +// Definitions of the classification (CanAddMIFn) functions. Each returns true when MI belongs to the named IGroup. +bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isMFMA(MI); +} + +bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isVALU(MI) && !TII->isMFMA(MI); +} + +bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isSALU(MI); +} + +bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)); +} + +bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayLoad() && + (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))); +} + +bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayStore() && + (TII->isVMEM(MI) 
|| (TII->isFLAT(MI) && !TII->isDS(MI))); +} + +bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayStore() && TII->isDS(MI); +} + +bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayLoad() && TII->isDS(MI); +} + +// Enum holding the supported IGroups +enum IGroupClass { + igp_mfma, + igp_valu, + igp_salu, + igp_vmem, + igp_vmemr, + igp_vmemw, + igp_dsr, + igp_dsw +}; + +// Table of the supported IGroups; operator[] looks an entry up by name and returns nullptr when there is no match. +class IGroupTable { +private: + SmallVector IGroupEntries; + +public: + IGroupTable() { + IGroupEntries = {{"mfma", igp_mfma, isMFMASGMember}, + {"valu", igp_valu, isVALUSGMember}, + {"salu", igp_salu, isSALUSGMember}, + {"vmem", igp_vmem, isVMEMSGMember, {igp_vmemr, igp_vmemw}}, + {"vmemr", igp_vmemr, isVMEMReadSGMember}, + {"vmemw", igp_vmemw, isVMEMWriteSGMember}, + {"dsr", igp_dsr, isDSReadSGMember}, + {"dsw", igp_dsw, isDSWriteSGMember}}; + } + + IGroupTableEntry *operator[](std::string key) { + auto match = std::find_if( + IGroupEntries.begin(), IGroupEntries.end(), + [&key](IGroupTableEntry &IGPData) { return key == IGPData.GroupName; }); + return (match != IGroupEntries.end()) ? match : nullptr; + } +}; + +IGroupTable IGroupMasterTable; + +// The order of IGroup stages and their optional sizes as returned +// by the parser. +static SmallVector IGroupLPOrder; + +struct IGroupOrderParser : public cl::parser { + IGroupOrderParser(cl::Option &O) : cl::parser(O) {} + + // Possible categories that one token of the comma-separated + // option value may fall into + enum Token { tok_start, tok_group, tok_number, tok_error }; + + // The previously encountered token type + unsigned PrevToken = tok_start; + + // A bit vector encoding the encountered groups thus far + unsigned ObservedGroups = 0; + + // Ensure that we do not have multiple occurrences of the same + // igroup (including potential SubGroups). 
+ unsigned handleGroup(std::string &Token, cl::Option &O, + IGroupTableEntry &IGPData) { + unsigned Group = IGPData.IGP; + assert(Group < sizeof(ObservedGroups) * 8); + if (ObservedGroups & (1u << Group)) { + O.error("Multiple occurrences of " + Token); + return tok_error; + } + // Add group token to encountered groups + ObservedGroups |= 1u << Group; + + for (unsigned SubGroup : IGPData.SubGroups) { + assert(SubGroup < sizeof(ObservedGroups) * 8); + if (ObservedGroups & (1u << SubGroup)) { + O.error("Multiple occurrences of " + Token + + ". Overlaps with existing SubGroup"); + return tok_error; + } + // Add the subgroup to encountered groups so it cannot be named later + ObservedGroups |= 1u << SubGroup; + } + + return tok_group; + } + + // Check for properly formatted numbers and igroup strings. + // If we are unable to easily find one (or the token is empty), flag as error. + unsigned getTokenType(std::string &Token, cl::Option &O) { + std::string::const_iterator it = Token.begin(); + + // Check for a complete natural number. Decimals and + // negatives don't make sense in the context of group size, + // and are thus not supported + if (it != Token.end() && std::isdigit(static_cast<unsigned char>(*it))) { + while (it != Token.end() && std::isdigit(static_cast<unsigned char>(*it))) + ++it; + return (it == Token.end()) ? tok_number : tok_error; + } + + if (it != Token.end() && std::isalpha(static_cast<unsigned char>(*it))) { + // Transform the string to lower case to allow for + // more matching + std::transform(Token.begin(), Token.end(), Token.begin(), + [](unsigned char c) { return std::tolower(c); }); + + // Check if the token matches with a supported IGroup + auto Match = IGroupMasterTable[Token]; + return (!Match) ? 
tok_error : handleGroup(Token, O, *Match); + } + + return tok_error; + } + + bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, + std::string &Value) { + Value = Arg.str(); + unsigned CurrToken = getTokenType(Value, O); + + if (CurrToken == tok_error) + return O.error("Invalid Token '" + Arg + "'"); + + switch (PrevToken) { + case tok_start: + case tok_number: + // If there has been no token, or if the previous token was a group size, + // then we must encounter a group name. + if (CurrToken != tok_group) + return O.error("Invalid Token '" + Arg + "'. Expected group token."); + break; + case tok_group: + // If we previously encountered a group name, then we can encounter either + // a size or another group name + if (CurrToken != tok_group && CurrToken != tok_number) + return O.error("Invalid Token '" + Arg + + "'. Expected group or number token."); + break; + case tok_error: + default: + // PrevToken is only ever assigned a non-error token below, so + // tok_error (or any other value) cannot be seen here. + llvm_unreachable("Unsupported Token occurred"); + } + + PrevToken = CurrToken; + return false; // Token accepted; error paths return O.error(...) instead. + } +}; + +static cl::list, IGroupOrderParser> + List("amdgpu-igrouplp-order", + cl::desc("This option is used to specify the order of groups and " + "their sizes to be used in AMDGPUIGroupLP. To specify, " + "enter a comma-separated list of groups in {salu, valu, " + "mfma, dsr, dsw, vmemr, vmemw, vmem} and an optional size " + "after each."), + cl::CommaSeparated, cl::location(IGroupLPOrder)); + static cl::opt> VMEMGroupMaxSize("amdgpu-igrouplp-vmem-group-size", cl::init(None), cl::Hidden, @@ -60,9 +275,6 @@ cl::desc("The maximum number of instructions to include " "in lds/gds write group.")); -typedef function_ref - CanAddMIFn; - // Classify instructions into groups to enable fine tuned control over the // scheduler. These groups may be more specific than current SchedModel // instruction classes. 
@@ -153,40 +365,6 @@ : canAddMI(canAddMI), MaxSize(MaxSize), DAG(DAG) {} }; -bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) { - return TII->isMFMA(MI); -} - -bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { - return TII->isVALU(MI) && !TII->isMFMA(MI); -} - -bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { - return TII->isSALU(MI); -} - -bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { - return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)); -} - -bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { - return MI.mayLoad() && - (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))); -} - -bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { - return MI.mayStore() && - (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))); -} - -bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { - return MI.mayStore() && TII->isDS(MI); -} - -bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { - return MI.mayLoad() && TII->isDS(MI); -} - class IGroupLPDAGMutation : public ScheduleDAGMutation { public: const SIInstrInfo *TII; @@ -268,11 +446,39 @@ // order in which edges will be added. In other words, given the // present ordering, we will try to make each VMEMRead instruction // a predecessor of each DSRead instruction, and so on. - SmallVector PipelineOrderGroups = { - SchedGroup(isVMEMSGMember, VMEMGroupMaxSize, DAG), - SchedGroup(isDSReadSGMember, LDRGroupMaxSize, DAG), - SchedGroup(isMFMASGMember, MFMAGroupMaxSize, DAG), - SchedGroup(isDSWriteSGMember, LDWGroupMaxSize, DAG)}; + + SmallVector PipelineOrderGroups; + + // Since the input string has been pre-parsed, we know we have a + // well-formed sequence of well-formed tokens: each entry starts with + // an IGroup name and is optionally followed by a size. 
+ if (IGroupLPOrder.size() > 0) { + auto it = IGroupLPOrder.begin(); + while (it != IGroupLPOrder.end()) { + assert(!std::isdigit((*it)[0])); + // Look up the table entry for this group name. The option parser has + // already rejected names that are not in the table, so Match is + // expected to be non-null. A size token that follows, if any, is + // stored into the entry's MaxSize before the SchedGroup is built. + auto Match = IGroupMasterTable[*it]; + assert(Match); + ++it; + if (it != IGroupLPOrder.end() && std::isdigit((*it)[0])) { + Match->MaxSize = std::stoi(*it); + ++it; + } + PipelineOrderGroups.push_back( + SchedGroup(Match->canAddMI, Match->MaxSize, DAG)); + } + } + + // Default to backward-compatible behavior + else { + PipelineOrderGroups = {SchedGroup(isVMEMSGMember, VMEMGroupMaxSize, DAG), + SchedGroup(isDSReadSGMember, LDRGroupMaxSize, DAG), + SchedGroup(isMFMASGMember, MFMAGroupMaxSize, DAG), + SchedGroup(isDSWriteSGMember, LDWGroupMaxSize, DAG)}; + } for (SUnit &SU : DAG->SUnits) { LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU)); diff --git a/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir b/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir --- a/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir +++ b/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir @@ -1,14 +1,17 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - 2>&1 | FileCheck -check-prefix=DEFAULT %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-igrouplp=1 2>&1 | FileCheck -check-prefix=PIPELINE %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-igrouplp=1 2>&1 | FileCheck -check-prefix=DEFAULTPIPE %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-igrouplp=1 -amdgpu-igrouplp-order=vmem,8,mfma,8 2>&1 | FileCheck 
-check-prefix=PARTIALPIPE %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-igrouplp=1 -amdgpu-igrouplp-order=valu,salu,dsr,vmemr,mfma,20,vmemw,dsw 2>&1 | FileCheck -check-prefix=COMPLETEPIPE %s + --- -name: no_pipeline +name: no_default_pipeline tracksRegLiveness: true body: | bb.0: liveins: $sgpr0, $vgpr10_vgpr11 - ; DEFAULT-LABEL: name: no_pipeline + ; DEFAULT-LABEL: name: no_default_pipeline ; DEFAULT: liveins: $sgpr0, $vgpr10_vgpr11 ; DEFAULT-NEXT: {{ $}} ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec @@ -21,19 +24,45 @@ ; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec ; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec ; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec - ; PIPELINE-LABEL: name: no_pipeline - ; PIPELINE: liveins: $sgpr0, $vgpr10_vgpr11 - ; PIPELINE-NEXT: {{ $}} - ; PIPELINE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - ; PIPELINE-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec - ; PIPELINE-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec - ; PIPELINE-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec - ; PIPELINE-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec - ; PIPELINE-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + ; DEFAULTPIPE-LABEL: name: no_default_pipeline + ; DEFAULTPIPE: liveins: $sgpr0, $vgpr10_vgpr11 + ; DEFAULTPIPE-NEXT: {{ $}} + ; DEFAULTPIPE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit 
$exec + ; DEFAULTPIPE-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + ; PARTIALPIPE-LABEL: name: no_default_pipeline + ; PARTIALPIPE: liveins: $sgpr0, $vgpr10_vgpr11 + ; PARTIALPIPE-NEXT: {{ $}} + ; PARTIALPIPE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; PARTIALPIPE-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + ; COMPLETEPIPE-LABEL: name: no_default_pipeline + ; COMPLETEPIPE: liveins: $sgpr0, $vgpr10_vgpr11 + ; COMPLETEPIPE-NEXT: {{ $}} + ; COMPLETEPIPE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec + ; COMPLETEPIPE-NEXT: 
$vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, killed $vgpr1, 0, 0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec $vgpr1 = V_MOV_B32_e32 1, implicit $exec $vgpr0 = V_MOV_B32_e32 1, implicit $exec $vgpr8 = V_MOV_B32_e32 0, implicit $exec @@ -48,12 +77,12 @@ --- -name: full_pipe +name: full_pipeline tracksRegLiveness: true body: | bb.0: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $sgpr0, $vgpr10_vgpr11 - ; DEFAULT-LABEL: name: full_pipe + ; DEFAULT-LABEL: name: full_pipeline ; DEFAULT: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11 ; DEFAULT-NEXT: {{ $}} ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec @@ -97,52 +126,144 @@ ; DEFAULT-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec ; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr23, killed $vgpr3, 0, 16, implicit $m0, implicit $exec ; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec - ; PIPELINE-LABEL: name: full_pipe - ; PIPELINE: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11 - ; PIPELINE-NEXT: {{ $}} - ; PIPELINE-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; PIPELINE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec - ; PIPELINE-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec - ; PIPELINE-NEXT: $vgpr6 = 
GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - ; PIPELINE-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec - ; PIPELINE-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec - ; PIPELINE-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec - ; PIPELINE-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec - ; PIPELINE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; PIPELINE-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; PIPELINE-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec - ; PIPELINE-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec - ; PIPELINE-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec { - ; PIPELINE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec - ; PIPELINE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - ; PIPELINE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec - ; PIPELINE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec - ; PIPELINE-NEXT: $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - ; PIPELINE-NEXT: } - ; PIPELINE-NEXT: DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, 
implicit $m0, implicit $exec - ; PIPELINE-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec { - ; PIPELINE-NEXT: $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec - ; PIPELINE-NEXT: $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec - ; PIPELINE-NEXT: } - ; PIPELINE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr7, implicit $m0, implicit $exec, implicit killed $vgpr23, implicit killed $vgpr3 { - ; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec - ; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr23, killed $vgpr3, 0, 16, implicit $m0, implicit $exec - ; PIPELINE-NEXT: } - ; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec + ; DEFAULTPIPE-LABEL: name: full_pipeline + ; 
DEFAULTPIPE: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11 + ; DEFAULTPIPE-NEXT: {{ $}} + ; DEFAULTPIPE-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULTPIPE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, 
implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec { + ; DEFAULTPIPE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; DEFAULTPIPE-NEXT: } + ; DEFAULTPIPE-NEXT: DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec + ; DEFAULTPIPE-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec { + ; DEFAULTPIPE-NEXT: $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec + ; DEFAULTPIPE-NEXT: } + ; DEFAULTPIPE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULTPIPE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULTPIPE-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULTPIPE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULTPIPE-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULTPIPE-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, $vgpr0, 
killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULTPIPE-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr7, implicit $m0, implicit $exec, implicit killed $vgpr23, implicit killed $vgpr3 { + ; DEFAULTPIPE-NEXT: DS_WRITE_B32 killed $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec + ; DEFAULTPIPE-NEXT: DS_WRITE_B32 killed $vgpr23, killed $vgpr3, 0, 16, implicit $m0, implicit $exec + ; DEFAULTPIPE-NEXT: } + ; DEFAULTPIPE-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec + ; PARTIALPIPE-LABEL: name: full_pipeline + ; PARTIALPIPE: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11 + ; PARTIALPIPE-NEXT: {{ $}} + ; PARTIALPIPE-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr21 = 
V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec + ; PARTIALPIPE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec { + ; PARTIALPIPE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; PARTIALPIPE-NEXT: } + ; PARTIALPIPE-NEXT: DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec + ; PARTIALPIPE-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec { + ; PARTIALPIPE-NEXT: $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec + ; PARTIALPIPE-NEXT: } + ; PARTIALPIPE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; PARTIALPIPE-NEXT: DS_WRITE_B32 $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec + ; PARTIALPIPE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed 
$agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; PARTIALPIPE-NEXT: DS_WRITE_B32 killed $vgpr23, $vgpr3, 0, 16, implicit $m0, implicit $exec + ; PARTIALPIPE-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + ; PARTIALPIPE-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec + ; PARTIALPIPE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; PARTIALPIPE-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec + ; PARTIALPIPE-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; COMPLETEPIPE-LABEL: name: full_pipeline + ; COMPLETEPIPE: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11 + ; COMPLETEPIPE-NEXT: {{ $}} + ; COMPLETEPIPE-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr17 = 
V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec + ; COMPLETEPIPE-NEXT: BUNDLE implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit $vgpr0_vgpr1, implicit $exec, implicit $vgpr2_vgpr3, implicit $vgpr4_vgpr5 { + ; COMPLETEPIPE-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec + ; COMPLETEPIPE-NEXT: } + ; COMPLETEPIPE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec + ; COMPLETEPIPE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec { + ; COMPLETEPIPE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; COMPLETEPIPE-NEXT: } + ; COMPLETEPIPE-NEXT: DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, 
implicit $m0, implicit $exec + ; COMPLETEPIPE-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec { + ; COMPLETEPIPE-NEXT: $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec + ; COMPLETEPIPE-NEXT: } + ; COMPLETEPIPE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; COMPLETEPIPE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; COMPLETEPIPE-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + ; COMPLETEPIPE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; COMPLETEPIPE-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec + ; COMPLETEPIPE-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; COMPLETEPIPE-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr7, implicit $m0, implicit $exec, implicit killed $vgpr23, implicit killed $vgpr3, implicit killed $vgpr9, implicit killed $vgpr24 { + ; COMPLETEPIPE-NEXT: DS_WRITE_B32 killed $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec + ; COMPLETEPIPE-NEXT: DS_WRITE_B32 killed $vgpr23, killed $vgpr3, 0, 16, implicit $m0, implicit $exec + ; COMPLETEPIPE-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, 
implicit $m0, implicit $exec + ; COMPLETEPIPE-NEXT: } $vgpr0 = V_MOV_B32_e32 0, implicit $exec $vgpr1 = V_MOV_B32_e32 1, implicit $exec $vgpr2 = V_MOV_B32_e32 2, implicit $exec @@ -199,17 +320,39 @@ ; DEFAULT-NEXT: $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec ; DEFAULT-NEXT: } ; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-LABEL: name: block_ends_in_bundle - ; PIPELINE: liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3 - ; PIPELINE-NEXT: {{ $}} - ; PIPELINE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit killed $vgpr7, implicit $exec { - ; PIPELINE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec - ; PIPELINE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - ; PIPELINE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec - ; PIPELINE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec - ; PIPELINE-NEXT: $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec - ; PIPELINE-NEXT: } - ; PIPELINE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULTPIPE-LABEL: name: block_ends_in_bundle + ; DEFAULTPIPE: liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3 + ; DEFAULTPIPE-NEXT: {{ $}} + ; DEFAULTPIPE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def 
$vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit killed $vgpr7, implicit $exec { + ; DEFAULTPIPE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec + ; DEFAULTPIPE-NEXT: $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec + ; DEFAULTPIPE-NEXT: } + ; DEFAULTPIPE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; PARTIALPIPE-LABEL: name: block_ends_in_bundle + ; PARTIALPIPE: liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3 + ; PARTIALPIPE-NEXT: {{ $}} + ; PARTIALPIPE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit killed $vgpr7, implicit $exec { + ; PARTIALPIPE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec + ; PARTIALPIPE-NEXT: $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec + ; PARTIALPIPE-NEXT: } + ; PARTIALPIPE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, 
killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; COMPLETEPIPE-LABEL: name: block_ends_in_bundle + ; COMPLETEPIPE: liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3 + ; COMPLETEPIPE-NEXT: {{ $}} + ; COMPLETEPIPE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit killed $vgpr7, implicit $exec { + ; COMPLETEPIPE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec + ; COMPLETEPIPE-NEXT: $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec + ; COMPLETEPIPE-NEXT: } + ; COMPLETEPIPE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec { $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec