diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -64,6 +64,7 @@ BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n") BUILTIN(__builtin_amdgcn_sched_barrier, "vIi", "n") BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi", "n") +BUILTIN(__builtin_amdgcn_iglp_opt, "vIi", "n") BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n") BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n") BUILTIN(__builtin_amdgcn_ds_gws_init, "vUiUi", "n") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -422,6 +422,19 @@ __builtin_amdgcn_sched_group_barrier(15, 10000, -1); } +// CHECK-LABEL: @test_iglp_opt +// CHECK: call void @llvm.amdgcn.iglp.opt(i32 0) +// CHECK: call void @llvm.amdgcn.iglp.opt(i32 1) +// CHECK: call void @llvm.amdgcn.iglp.opt(i32 4) +// CHECK: call void @llvm.amdgcn.iglp.opt(i32 15) +void test_iglp_opt() +{ + __builtin_amdgcn_iglp_opt(0); + __builtin_amdgcn_iglp_opt(1); + __builtin_amdgcn_iglp_opt(4); + __builtin_amdgcn_iglp_opt(15); +} + // CHECK-LABEL: @test_s_sleep // CHECK: call void @llvm.amdgcn.s.sleep(i32 1) // CHECK: call void @llvm.amdgcn.s.sleep(i32 15) diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl --- a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl @@ -72,6 +72,11 @@ __builtin_amdgcn_sched_group_barrier(0, 1, x); // expected-error {{argument to '__builtin_amdgcn_sched_group_barrier' must be a constant integer}} } +void test_iglp_opt(int x) +{ + __builtin_amdgcn_iglp_opt(x); // expected-error {{argument to '__builtin_amdgcn_iglp_opt' must be a constant integer}} +} + void test_sicmp_i32(global ulong* out, int a, int b, uint c) { *out = __builtin_amdgcn_sicmp(a, b, c); // expected-error {{argument to '__builtin_amdgcn_sicmp' must be a constant integer}} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -254,6 +254,12 @@ [ImmArg>, ImmArg>, ImmArg>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn]>; +// Scheduler optimization hint. +// MASK = 0: Small gemm opt +def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">, + Intrinsic<[], [llvm_i32_ty], [ImmArg>, IntrNoMem, IntrHasSideEffects, IntrConvergent, + IntrWillReturn]>; + def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">, Intrinsic<[], [llvm_i32_ty], [ImmArg>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -15,7 +15,6 @@ namespace llvm { std::unique_ptr createIGroupLPDAGMutation(); -std::unique_ptr createSchedBarrierDAGMutation(); } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -31,12 +31,6 @@ namespace { -static cl::opt - EnableIGroupLP("amdgpu-igrouplp", - cl::desc("Enable construction of Instruction Groups and " - "their ordering for scheduling"), - cl::init(false)); - static cl::opt EnableExactSolver( "amdgpu-igrouplp-exact-solver", cl::Hidden, cl::desc("Whether to use the exponential time solver to fit " @@ -106,7 +100,10 @@ int SyncID = 0; // SGID is used to map instructions to candidate SchedGroups - int SGID; + unsigned SGID; + + // Count of the number of created SchedGroups, used to initialize SGID. + static unsigned NumSchedGroups; ScheduleDAGInstrs *DAG; @@ -180,18 +177,22 @@ SchedGroup(SchedGroupMask SGMask, Optional MaxSize, ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) - : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {} + : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) { + SGID = NumSchedGroups++; + } SchedGroup(SchedGroupMask SGMask, Optional MaxSize, int SyncID, - int SGID, ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) - : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), SGID(SGID), DAG(DAG), - TII(TII) {} + ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) + : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) { + SGID = NumSchedGroups++; + } }; // Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER. static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) { assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER || - SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER); + SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER || + SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT); while (!SU.Preds.empty()) for (auto &P : SU.Preds) @@ -725,31 +726,107 @@ makePipeline(); } -class IGroupLPDAGMutation : public ScheduleDAGMutation { -private: - // Organize lists of SchedGroups by their SyncID. SchedGroups / - // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added - // between then. - DenseMap> SyncedSchedGroups; +enum IGLPStrategyID : int { MFMASmallGemmOptID = 0 }; - // The number of created sched groups -- also used as SGID - int NumCreatedSchedGroups = 0; +// Implement a IGLP scheduling strategy. +class IGLPStrategy { +protected: + ScheduleDAGInstrs *DAG; - // Used to track instructions that can be mapped to multiple sched groups - DenseMap SyncedInstrs; + const SIInstrInfo *TII; public: - const SIInstrInfo *TII; - ScheduleDAGMI *DAG; + // Add SchedGroups to \p Pipeline to implement this Strategy. + virtual void applyIGLPStrategy( + DenseMap &SyncedInstrs, + DenseMap> &SyncedSchedGroups) = 0; - IGroupLPDAGMutation() = default; - void apply(ScheduleDAGInstrs *DAGInstrs) override; + // Returns true if this strategy should be applied to a ScheduleDAG. + virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0; + + IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) + : DAG(DAG), TII(TII) {} + + virtual ~IGLPStrategy() = default; }; -// DAG mutation that coordinates with the SCHED_BARRIER instruction and -// corresponding builtin. The mutation adds edges from specific instruction -// classes determined by the SCHED_BARRIER mask so that they cannot be -class SchedBarrierDAGMutation : public ScheduleDAGMutation { +class MFMASmallGemmOpt final : public IGLPStrategy { +public: + void applyIGLPStrategy( + DenseMap &SyncedInstrs, + DenseMap> &SyncedSchedGroups) override; + + bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } + + MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) + : IGLPStrategy(DAG, TII) {} +}; + +void MFMASmallGemmOpt::applyIGLPStrategy( + DenseMap &SyncedInstrs, + DenseMap> &SyncedSchedGroups) { + // Count the number of MFMA instructions. + unsigned MFMACount = 0; + for (auto I = DAG->begin(), E = DAG->end(); I != E; ++I) { + if (TII->isMFMA(*I)) + ++MFMACount; + } + + const unsigned PipelineSyncID = 0; + SchedGroup *SG = nullptr; + for (unsigned I = 0; I < MFMACount; ++I) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VMEM_READ, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VMEM_WRITE, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + for (unsigned I = 0; I < MFMACount; ++I) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VMEM_READ, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VMEM_WRITE, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } +} + +static std::unique_ptr +createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG, + const SIInstrInfo *TII) { + switch (ID) { + case MFMASmallGemmOptID: + return std::make_unique(DAG, TII); + } + + llvm_unreachable("Unknown IGLPStrategyID"); +} + +class IGroupLPDAGMutation : public ScheduleDAGMutation { private: const SIInstrInfo *TII; @@ -760,9 +837,6 @@ // between then. DenseMap> SyncedSchedGroups; - // The number of create sched groups -- also used as SGID - int NumCreatedSchedGroups = 0; - // Used to track instructions that can be mapped to multiple sched groups DenseMap SyncedInstrs; @@ -784,12 +858,16 @@ void initSchedGroupBarrierPipelineStage( std::vector::reverse_iterator RIter); + void initIGLPOpt(SUnit &SU); + public: void apply(ScheduleDAGInstrs *DAGInstrs) override; - SchedBarrierDAGMutation() = default; + IGroupLPDAGMutation() = default; }; +unsigned SchedGroup::NumSchedGroups = 0; + bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) { if (A != B && DAG->canAddEdge(B, A)) { DAG->addEdge(B, SDep(A, SDep::Artificial)); @@ -960,88 +1038,44 @@ } void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { - const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget(); - TII = ST.getInstrInfo(); - DAG = static_cast(DAGInstrs); - - // IGroupLP and sched_group_barrier are mutually exclusive mutations. - // Check for sched_group_barriers as that mutation gets priority. - for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) { - if (R->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) { - return; - } - } - - SyncedSchedGroups.clear(); - SyncedInstrs.clear(); - const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); - if (!TSchedModel || DAG->SUnits.empty()) - return; - - LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n"); - - // The order of InstructionGroups in this vector defines the - // order in which edges will be added. In other words, given the - // present ordering, we will try to make each VMEMRead instruction - // a predecessor of each DSRead instruction, and so on. - - struct SGParams { - SchedGroupMask Mask; - Optional Size; - int SyncID; - - SGParams(SchedGroupMask Mask, Optional Size, int SyncID) - : Mask(Mask), Size(Size), SyncID(SyncID) {} - }; - - SmallVector PipelineOrderGroups; - - for (size_t i = 0; i < DAG->SUnits.size() / 4; i++) { - PipelineOrderGroups.push_back({SchedGroupMask::DS_READ, 8, 0}); - PipelineOrderGroups.push_back({SchedGroupMask::MFMA, 1, 0}); - PipelineOrderGroups.push_back({SchedGroupMask::DS_WRITE, 8, 0}); - } - - auto I = PipelineOrderGroups.rbegin(); - auto E = PipelineOrderGroups.rend(); - for (; I < E; I++) { - auto &SG = SyncedSchedGroups[I->SyncID].emplace_back( - I->Mask, I->Size, I->SyncID, NumCreatedSchedGroups++, DAG, TII); - SG.initSchedGroup(SyncedInstrs[SG.getSyncID()]); - } - - PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG); - // PipelineSolver performs the mutation by adding the edges it - // determined as the best - PS.solve(); -} - -void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); if (!TSchedModel || DAGInstrs->SUnits.empty()) return; - LLVM_DEBUG(dbgs() << "Applying SchedBarrierDAGMutation...\n"); + LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n"); const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget(); TII = ST.getInstrInfo(); DAG = static_cast(DAGInstrs); SyncedSchedGroups.clear(); SyncedInstrs.clear(); + bool foundSB = false; + bool foundIGLP = false; for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) { - if (R->getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER) + unsigned Opc = R->getInstr()->getOpcode(); + // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive. + if (Opc == AMDGPU::SCHED_BARRIER) { addSchedBarrierEdges(*R); - - else if (R->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) + foundSB = true; + } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) { initSchedGroupBarrierPipelineStage(R); + foundSB = true; + } else if (Opc == AMDGPU::IGLP_OPT) { + resetEdges(*R, DAG); + if (!foundSB && !foundIGLP) + initIGLPOpt(*R); + foundIGLP = true; + } } - PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG); - // PipelineSolver performs the mutation by adding the edges it - // determined as the best - PS.solve(); + if (foundSB || foundIGLP) { + PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG); + // PipelineSolver performs the mutation by adding the edges it + // determined as the best + PS.solve(); + } } -void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { +void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { MachineInstr &MI = *SchedBarrier.getInstr(); assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); // Remove all existing edges from the SCHED_BARRIER that were added due to the @@ -1059,7 +1093,7 @@ } SchedGroupMask -SchedBarrierDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const { +IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const { // Invert mask and erase bits for types of instructions that are implied to be // allowed past the SCHED_BARRIER. SchedGroupMask InvertedMask = ~Mask; @@ -1093,7 +1127,7 @@ return InvertedMask; } -void SchedBarrierDAGMutation::initSchedGroupBarrierPipelineStage( +void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage( std::vector::reverse_iterator RIter) { // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due // to the instruction having side effects. @@ -1104,22 +1138,26 @@ int32_t Size = SGB.getOperand(1).getImm(); int32_t SyncID = SGB.getOperand(2).getImm(); - auto &SG = SyncedSchedGroups[SyncID].emplace_back( - (SchedGroupMask)SGMask, Size, SyncID, NumCreatedSchedGroups++, DAG, TII); + auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask, + Size, SyncID, DAG, TII); SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]); } +void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { + IGLPStrategyID StrategyID = + (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm(); + auto S = createIGLPStrategy(StrategyID, DAG, TII); + if (S->shouldApplyStrategy(DAG)) + S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups); +} + } // namespace namespace llvm { std::unique_ptr createIGroupLPDAGMutation() { - return EnableIGroupLP ? std::make_unique() : nullptr; -} - -std::unique_ptr createSchedBarrierDAGMutation() { - return std::make_unique(); + return std::make_unique(); } } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -234,6 +234,16 @@ return; } + if (MI->getOpcode() == AMDGPU::IGLP_OPT) { + if (isVerbose()) { + std::string HexString; + raw_string_ostream HexStream(HexString); + HexStream << format_hex(MI->getOperand(0).getImm(), 10, true); + OutStreamer->emitRawComment(" iglp_opt mask(" + HexString + ")"); + } + return; + } + if (MI->getOpcode() == AMDGPU::SI_MASKED_UNREACHABLE) { if (isVerbose()) OutStreamer->emitRawComment(" divergent unreachable"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -425,7 +425,6 @@ if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createIGroupLPDAGMutation()); - DAG->addMutation(createSchedBarrierDAGMutation()); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); return DAG; @@ -436,7 +435,6 @@ ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(C, std::make_unique(C)); DAG->addMutation(createIGroupLPDAGMutation()); - DAG->addMutation(createSchedBarrierDAGMutation()); return DAG; } @@ -939,14 +937,15 @@ ScheduleDAGInstrs * createPostMachineScheduler(MachineSchedContext *C) const override { - ScheduleDAGMI *DAG = createGenericSchedPostRA(C); + ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive( + C, std::make_unique(C), + /*RemoveKillFlags=*/true); const GCNSubtarget &ST = C->MF->getSubtarget(); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); DAG->addMutation(createIGroupLPDAGMutation()); - DAG->addMutation(createSchedBarrierDAGMutation()); if (isPassEnabled(EnableVOPD, CodeGenOpt::Less)) DAG->addMutation(createVOPDPairingMutation()); return DAG; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -162,6 +162,9 @@ // Regions that has the same occupancy as the latest MinOccupancy BitVector RegionsWithMinOcc; + // Regions that have IGLP instructions (SCHED_GROUP_BARRIER or IGLP_OPT). + BitVector RegionsWithIGLPInstrs; + // Region live-in cache. SmallVector LiveIns; @@ -231,6 +234,8 @@ // RP after scheduling the current region. GCNRegPressure PressureAfter; + std::vector> SavedMutations; + GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG); public: @@ -278,8 +283,6 @@ class UnclusteredHighRPStage : public GCNSchedStage { private: - std::vector> SavedMutations; - // Save the initial occupancy before starting this stage. unsigned InitialOccupancy; @@ -355,6 +358,22 @@ : GCNSchedStage(StageID, DAG) {} }; +class GCNPostScheduleDAGMILive final : public ScheduleDAGMI { +private: + std::vector> SavedMutations; + + bool HasIGLPInstrs = false; + +public: + void schedule() override; + + void finalizeSchedule() override; + + GCNPostScheduleDAGMILive(MachineSchedContext *C, + std::unique_ptr S, + bool RemoveKillFlags); +}; + } // End namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -24,6 +24,7 @@ //===----------------------------------------------------------------------===// #include "GCNSchedStrategy.h" +#include "AMDGPUIGroupLP.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/RegisterClassInfo.h" @@ -31,7 +32,7 @@ using namespace llvm; -cl::opt +static cl::opt DisableUnclusterHighRP("amdgpu-disable-unclustred-high-rp-reschedule", cl::Hidden, cl::desc("Disable unclustred high register pressure " @@ -570,10 +571,12 @@ RegionsWithHighRP.resize(Regions.size()); RegionsWithExcessRP.resize(Regions.size()); RegionsWithMinOcc.resize(Regions.size()); + RegionsWithIGLPInstrs.resize(Regions.size()); RescheduleRegions.set(); RegionsWithHighRP.reset(); RegionsWithExcessRP.reset(); RegionsWithMinOcc.reset(); + RegionsWithIGLPInstrs.reset(); runSchedStages(); } @@ -655,6 +658,8 @@ return false; SavedMutations.swap(DAG.Mutations); + DAG.addMutation(createIGroupLPDAGMutation()); + InitialOccupancy = DAG.MinOccupancy; // Aggressivly try to reduce register pressure in the unclustered high RP // stage. Temporarily increase occupancy target in the region. @@ -760,8 +765,18 @@ // Save original instruction order before scheduling for possible revert. Unsched.clear(); Unsched.reserve(DAG.NumRegionInstrs); - for (auto &I : DAG) - Unsched.push_back(&I); + if (StageID == GCNSchedStageID::OccInitialSchedule || + StageID == GCNSchedStageID::ILPInitialSchedule) { + for (auto &I : DAG) { + Unsched.push_back(&I); + if (I.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER || + I.getOpcode() == AMDGPU::IGLP_OPT) + DAG.RegionsWithIGLPInstrs[RegionIdx] = true; + } + } else { + for (auto &I : DAG) + Unsched.push_back(&I); + } PressureBefore = DAG.Pressure[RegionIdx]; @@ -774,6 +789,13 @@ S.HasHighPressure = false; + if (DAG.RegionsWithIGLPInstrs[RegionIdx] && + StageID != GCNSchedStageID::UnclusteredHighRPReschedule) { + SavedMutations.clear(); + SavedMutations.swap(DAG.Mutations); + DAG.addMutation(createIGroupLPDAGMutation()); + } + return true; } @@ -829,6 +851,10 @@ // reason that the original schedule is better. checkScheduling(); + if (DAG.RegionsWithIGLPInstrs[RegionIdx] && + StageID != GCNSchedStageID::UnclusteredHighRPReschedule) + SavedMutations.swap(DAG.Mutations); + DAG.exitRegion(); RegionIdx++; } @@ -1316,3 +1342,34 @@ } } } + +static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { + return std::any_of( + DAG->begin(), DAG->end(), [](MachineBasicBlock::iterator MI) { + unsigned Opc = MI->getOpcode(); + return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT; + }); +} + +GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive( + MachineSchedContext *C, std::unique_ptr S, + bool RemoveKillFlags) + : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {} + +void GCNPostScheduleDAGMILive::schedule() { + HasIGLPInstrs = hasIGLPInstrs(this); + if (HasIGLPInstrs) { + SavedMutations.clear(); + SavedMutations.swap(Mutations); + addMutation(createIGroupLPDAGMutation()); + } + + ScheduleDAGMI::schedule(); +} + +void GCNPostScheduleDAGMILive::finalizeSchedule() { + if (HasIGLPInstrs) + SavedMutations.swap(Mutations); + + ScheduleDAGMI::finalizeSchedule(); +} diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -344,6 +344,19 @@ let isMeta = 1; } +def IGLP_OPT : SPseudoInstSI<(outs), (ins i32imm:$mask), + [(int_amdgcn_iglp_opt (i32 timm:$mask))]> { + let SchedRW = []; + let hasNoSchedulingInfo = 1; + let hasSideEffects = 1; + let mayLoad = 0; + let mayStore = 0; + let isConvergent = 1; + let FixedSize = 1; + let Size = 0; + let isMeta = 1; +} + // SI pseudo instructions. These are used by the CFG structurizer pass // and should be lowered to ISA instructions prior to codegen. diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp --- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp +++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp @@ -131,6 +131,17 @@ bool Changed = false; for (MachineBasicBlock &MBB : MF) { + bool HasIGLPInstrs = + std::any_of(MBB.instr_begin(), MBB.instr_end(), [](MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + return (Opc == AMDGPU::SCHED_GROUP_BARRIER || + Opc == AMDGPU::IGLP_OPT); + }); + + // Don't cluster with IGLP instructions. + if (HasIGLPInstrs) + continue; + MachineBasicBlock::instr_iterator Next; MachineBasicBlock::instr_iterator B = MBB.instr_begin(); MachineBasicBlock::instr_iterator E = MBB.instr_end(); diff --git a/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.ll b/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.ll +++ /dev/null @@ -1,277 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-igrouplp=1 < %s | FileCheck -check-prefix=GREEDY %s -; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-igrouplp-exact-solver -amdgpu-igrouplp=1 < %s | FileCheck -check-prefix=EXACT %s - -define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(<32 x float> addrspace(3)* noalias %in, <32 x float> addrspace(3)* noalias %out) #0 { -; GREEDY-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: -; GREEDY: ; %bb.0: ; %entry -; GREEDY-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GREEDY-NEXT: v_lshlrev_b32_e32 v33, 7, v0 -; GREEDY-NEXT: v_mov_b32_e32 v34, 1.0 -; GREEDY-NEXT: v_mov_b32_e32 v35, 2.0 -; GREEDY-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY-NEXT: v_add_u32_e32 v32, s0, v33 -; GREEDY-NEXT: ds_read_b128 v[28:31], v32 offset:112 -; GREEDY-NEXT: ds_read_b128 v[24:27], v32 offset:96 -; GREEDY-NEXT: ds_read_b128 v[20:23], v32 offset:80 -; GREEDY-NEXT: ds_read_b128 v[16:19], v32 offset:64 -; GREEDY-NEXT: ds_read_b128 v[0:3], v32 -; GREEDY-NEXT: ds_read_b128 v[4:7], v32 offset:16 -; GREEDY-NEXT: ds_read_b128 v[8:11], v32 offset:32 -; GREEDY-NEXT: ds_read_b128 v[12:15], v32 offset:48 -; GREEDY-NEXT: v_add_u32_e32 v33, s1, v33 -; GREEDY-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31] -; GREEDY-NEXT: s_nop 7 -; GREEDY-NEXT: s_nop 7 -; GREEDY-NEXT: s_nop 2 -; GREEDY-NEXT: ds_write_b128 v33, v[28:31] offset:112 -; GREEDY-NEXT: ds_write_b128 v33, v[24:27] offset:96 -; GREEDY-NEXT: ds_write_b128 v33, v[20:23] offset:80 -; GREEDY-NEXT: ds_write_b128 v33, v[16:19] offset:64 -; GREEDY-NEXT: ds_write_b128 v33, v[12:15] offset:48 -; GREEDY-NEXT: ds_write_b128 v33, v[8:11] offset:32 -; GREEDY-NEXT: ds_write_b128 v33, v[4:7] offset:16 -; GREEDY-NEXT: ds_write_b128 v33, v[0:3] -; GREEDY-NEXT: ds_read_b128 v[64:67], v32 offset:8304 -; GREEDY-NEXT: ds_read_b128 v[60:63], v32 offset:8288 -; GREEDY-NEXT: ds_read_b128 v[56:59], v32 offset:8272 -; GREEDY-NEXT: ds_read_b128 v[52:55], v32 offset:8256 -; GREEDY-NEXT: ds_read_b128 v[48:51], v32 offset:8240 -; GREEDY-NEXT: ds_read_b128 v[44:47], v32 offset:8224 -; GREEDY-NEXT: ds_read_b128 v[40:43], v32 offset:8208 -; GREEDY-NEXT: ds_read_b128 v[36:39], v32 offset:8192 -; GREEDY-NEXT: v_mov_b32_e32 v0, s1 -; GREEDY-NEXT: v_add_u32_e32 v1, 0x6000, v32 -; GREEDY-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY-NEXT: v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67] -; GREEDY-NEXT: s_nop 7 -; GREEDY-NEXT: s_nop 7 -; GREEDY-NEXT: s_nop 2 -; GREEDY-NEXT: ds_write_b128 v0, v[60:63] offset:8288 -; GREEDY-NEXT: ds_write_b128 v0, v[64:67] offset:8304 -; GREEDY-NEXT: ds_write_b128 v0, v[52:55] offset:8256 -; GREEDY-NEXT: ds_write_b128 v0, v[56:59] offset:8272 -; GREEDY-NEXT: ds_write_b128 v0, v[44:47] offset:8224 -; GREEDY-NEXT: ds_write_b128 v0, v[48:51] offset:8240 -; GREEDY-NEXT: ds_write_b128 v0, v[36:39] offset:8192 -; GREEDY-NEXT: ds_write_b128 v0, v[40:43] offset:8208 -; GREEDY-NEXT: ds_read_b128 v[64:67], v32 offset:24688 -; GREEDY-NEXT: ds_read_b128 v[60:63], v32 offset:24672 -; GREEDY-NEXT: ds_read_b128 v[56:59], v32 offset:24656 -; GREEDY-NEXT: ds_read_b128 v[52:55], v32 offset:24640 -; GREEDY-NEXT: ds_read_b128 v[48:51], v32 offset:24624 -; GREEDY-NEXT: ds_read_b128 v[44:47], v32 offset:24608 -; GREEDY-NEXT: ds_read_b128 v[40:43], v32 offset:24592 -; GREEDY-NEXT: ds_read_b128 v[36:39], v32 offset:24576 -; GREEDY-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY-NEXT: v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67] -; GREEDY-NEXT: s_nop 7 -; GREEDY-NEXT: s_nop 7 -; GREEDY-NEXT: s_nop 2 -; GREEDY-NEXT: ds_write_b128 v0, v[60:63] offset:16480 -; GREEDY-NEXT: ds_write_b128 v0, v[64:67] offset:16496 -; GREEDY-NEXT: ds_write_b128 v0, v[52:55] offset:16448 -; GREEDY-NEXT: ds_write_b128 v0, v[56:59] offset:16464 -; GREEDY-NEXT: ds_write_b128 v0, v[44:47] offset:16416 -; GREEDY-NEXT: ds_write_b128 v0, v[48:51] offset:16432 -; GREEDY-NEXT: ds_write_b128 v0, v[36:39] offset:16384 -; GREEDY-NEXT: ds_write_b128 v0, v[40:43] offset:16400 -; GREEDY-NEXT: ds_read_b128 v[64:67], v32 offset:49264 -; GREEDY-NEXT: ds_read_b128 v[60:63], v32 offset:49248 -; GREEDY-NEXT: ds_read_b128 v[56:59], v32 offset:49232 -; GREEDY-NEXT: ds_read_b128 v[52:55], v32 offset:49216 -; GREEDY-NEXT: ds_read_b128 v[48:51], v32 offset:49200 -; GREEDY-NEXT: ds_read_b128 v[44:47], v32 offset:49184 -; GREEDY-NEXT: ds_read_b128 v[40:43], v32 offset:49168 -; GREEDY-NEXT: ds_read_b128 v[36:39], v32 offset:49152 -; GREEDY-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY-NEXT: v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67] -; GREEDY-NEXT: s_nop 7 -; GREEDY-NEXT: s_nop 7 -; GREEDY-NEXT: s_nop 2 -; GREEDY-NEXT: ds_write_b128 v0, v[60:63] offset:24672 -; GREEDY-NEXT: ds_write_b128 v0, v[64:67] offset:24688 -; GREEDY-NEXT: ds_write_b128 v0, v[52:55] offset:24640 -; GREEDY-NEXT: ds_write_b128 v0, v[56:59] offset:24656 -; GREEDY-NEXT: ds_write_b128 v0, v[44:47] offset:24608 -; GREEDY-NEXT: ds_write_b128 v0, v[48:51] offset:24624 -; GREEDY-NEXT: ds_write_b128 v0, v[36:39] offset:24576 -; GREEDY-NEXT: ds_write_b128 v0, v[40:43] offset:24592 -; GREEDY-NEXT: ds_read_b128 v[30:33], v1 offset:57456 -; GREEDY-NEXT: ds_read_b128 v[26:29], v1 offset:57440 -; GREEDY-NEXT: ds_read_b128 v[22:25], v1 offset:57424 -; GREEDY-NEXT: ds_read_b128 v[18:21], v1 offset:57408 -; GREEDY-NEXT: ds_read_b128 v[2:5], v1 offset:57344 -; GREEDY-NEXT: ds_read_b128 v[6:9], v1 offset:57360 -; GREEDY-NEXT: ds_read_b128 v[10:13], v1 offset:57376 -; GREEDY-NEXT: ds_read_b128 v[14:17], v1 offset:57392 -; GREEDY-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY-NEXT: v_mfma_f32_32x32x1f32 v[2:33], v34, v35, v[2:33] -; GREEDY-NEXT: s_nop 7 -; GREEDY-NEXT: s_nop 7 -; GREEDY-NEXT: s_nop 2 -; GREEDY-NEXT: ds_write_b128 v0, v[26:29] offset:32864 -; GREEDY-NEXT: ds_write_b128 v0, v[30:33] offset:32880 -; GREEDY-NEXT: ds_write_b128 v0, v[18:21] offset:32832 -; GREEDY-NEXT: ds_write_b128 v0, v[22:25] offset:32848 -; GREEDY-NEXT: ds_write_b128 v0, v[10:13] offset:32800 -; GREEDY-NEXT: ds_write_b128 v0, v[14:17] offset:32816 -; GREEDY-NEXT: ds_write_b128 v0, v[2:5] offset:32768 -; GREEDY-NEXT: ds_write_b128 v0, v[6:9] offset:32784 -; GREEDY-NEXT: s_endpgm -; -; EXACT-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: -; EXACT: ; %bb.0: ; %entry -; EXACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; EXACT-NEXT: v_lshlrev_b32_e32 v33, 7, v0 -; EXACT-NEXT: v_mov_b32_e32 v34, 1.0 -; EXACT-NEXT: v_mov_b32_e32 v35, 2.0 -; EXACT-NEXT: s_waitcnt lgkmcnt(0) -; EXACT-NEXT: v_add_u32_e32 v32, s0, v33 -; EXACT-NEXT: ds_read_b128 v[28:31], v32 offset:112 -; EXACT-NEXT: ds_read_b128 v[24:27], v32 offset:96 -; EXACT-NEXT: ds_read_b128 v[20:23], v32 offset:80 -; EXACT-NEXT: ds_read_b128 v[16:19], v32 offset:64 -; EXACT-NEXT: ds_read_b128 v[0:3], v32 -; EXACT-NEXT: ds_read_b128 v[4:7], v32 offset:16 -; EXACT-NEXT: ds_read_b128 v[8:11], v32 offset:32 -; EXACT-NEXT: ds_read_b128 v[12:15], v32 offset:48 -; EXACT-NEXT: v_add_u32_e32 v33, s1, v33 -; EXACT-NEXT: s_waitcnt lgkmcnt(0) -; EXACT-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31] -; EXACT-NEXT: s_nop 7 -; EXACT-NEXT: s_nop 7 -; EXACT-NEXT: s_nop 2 -; EXACT-NEXT: ds_write_b128 v33, v[28:31] offset:112 -; EXACT-NEXT: ds_write_b128 v33, v[24:27] offset:96 -; EXACT-NEXT: ds_write_b128 v33, v[20:23] offset:80 -; EXACT-NEXT: ds_write_b128 v33, v[16:19] offset:64 -; EXACT-NEXT: ds_write_b128 v33, v[12:15] offset:48 -; EXACT-NEXT: ds_write_b128 v33, v[8:11] offset:32 -; EXACT-NEXT: ds_write_b128 v33, v[4:7] offset:16 -; EXACT-NEXT: ds_write_b128 v33, v[0:3] -; EXACT-NEXT: ds_read_b128 v[64:67], v32 offset:8304 -; EXACT-NEXT: ds_read_b128 v[60:63], v32 offset:8288 -; EXACT-NEXT: ds_read_b128 v[56:59], v32 offset:8272 -; EXACT-NEXT: ds_read_b128 v[52:55], v32 offset:8256 -; EXACT-NEXT: ds_read_b128 v[48:51], v32 offset:8240 -; EXACT-NEXT: ds_read_b128 v[44:47], v32 offset:8224 -; EXACT-NEXT: ds_read_b128 v[40:43], v32 offset:8208 -; EXACT-NEXT: ds_read_b128 v[36:39], v32 offset:8192 -; EXACT-NEXT: v_mov_b32_e32 v0, s1 -; EXACT-NEXT: v_add_u32_e32 v1, 0x6000, v32 -; EXACT-NEXT: s_waitcnt lgkmcnt(0) -; EXACT-NEXT: v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67] -; EXACT-NEXT: s_nop 7 -; EXACT-NEXT: s_nop 7 -; EXACT-NEXT: s_nop 2 -; EXACT-NEXT: ds_write_b128 v0, v[60:63] offset:8288 -; EXACT-NEXT: ds_write_b128 v0, v[64:67] offset:8304 -; EXACT-NEXT: ds_write_b128 v0, v[52:55] offset:8256 -; EXACT-NEXT: ds_write_b128 v0, v[56:59] offset:8272 -; EXACT-NEXT: ds_write_b128 v0, v[44:47] offset:8224 -; EXACT-NEXT: ds_write_b128 v0, v[48:51] offset:8240 -; EXACT-NEXT: ds_write_b128 v0, v[36:39] offset:8192 -; EXACT-NEXT: ds_write_b128 v0, v[40:43] offset:8208 -; EXACT-NEXT: ds_read_b128 v[64:67], v32 offset:24688 -; EXACT-NEXT: ds_read_b128 v[60:63], v32 offset:24672 -; EXACT-NEXT: ds_read_b128 v[56:59], v32 offset:24656 -; EXACT-NEXT: ds_read_b128 v[52:55], v32 offset:24640 -; EXACT-NEXT: ds_read_b128 v[48:51], v32 offset:24624 -; EXACT-NEXT: ds_read_b128 v[44:47], v32 offset:24608 -; EXACT-NEXT: ds_read_b128 v[40:43], v32 offset:24592 -; EXACT-NEXT: ds_read_b128 v[36:39], v32 offset:24576 -; EXACT-NEXT: s_waitcnt lgkmcnt(0) -; EXACT-NEXT: v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67] -; EXACT-NEXT: s_nop 7 -; EXACT-NEXT: s_nop 7 -; EXACT-NEXT: s_nop 2 -; EXACT-NEXT: ds_write_b128 v0, v[60:63] offset:16480 -; EXACT-NEXT: ds_write_b128 v0, v[64:67] offset:16496 -; EXACT-NEXT: ds_write_b128 v0, v[52:55] offset:16448 -; EXACT-NEXT: ds_write_b128 v0, v[56:59] offset:16464 -; EXACT-NEXT: ds_write_b128 v0, v[44:47] offset:16416 -; EXACT-NEXT: ds_write_b128 v0, v[48:51] offset:16432 -; EXACT-NEXT: ds_write_b128 v0, v[36:39] offset:16384 -; EXACT-NEXT: ds_write_b128 v0, v[40:43] offset:16400 -; EXACT-NEXT: ds_read_b128 v[64:67], v32 offset:49264 -; EXACT-NEXT: ds_read_b128 v[60:63], v32 offset:49248 -; EXACT-NEXT: ds_read_b128 v[56:59], v32 offset:49232 -; EXACT-NEXT: ds_read_b128 v[52:55], v32 offset:49216 -; EXACT-NEXT: ds_read_b128 v[48:51], v32 offset:49200 -; EXACT-NEXT: ds_read_b128 v[44:47], v32 offset:49184 -; EXACT-NEXT: ds_read_b128 v[40:43], v32 offset:49168 -; EXACT-NEXT: ds_read_b128 v[36:39], v32 offset:49152 -; EXACT-NEXT: s_waitcnt lgkmcnt(0) -; EXACT-NEXT: v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67] -; EXACT-NEXT: s_nop 7 -; EXACT-NEXT: s_nop 7 -; EXACT-NEXT: s_nop 2 -; EXACT-NEXT: ds_write_b128 v0, v[60:63] offset:24672 -; EXACT-NEXT: ds_write_b128 v0, v[64:67] offset:24688 -; EXACT-NEXT: ds_write_b128 v0, v[52:55] offset:24640 -; EXACT-NEXT: ds_write_b128 v0, v[56:59] offset:24656 -; EXACT-NEXT: ds_write_b128 v0, v[44:47] offset:24608 -; EXACT-NEXT: ds_write_b128 v0, v[48:51] offset:24624 -; EXACT-NEXT: ds_write_b128 v0, v[36:39] offset:24576 -; EXACT-NEXT: ds_write_b128 v0, v[40:43] offset:24592 -; EXACT-NEXT: ds_read_b128 v[30:33], v1 offset:57456 -; EXACT-NEXT: ds_read_b128 v[26:29], v1 offset:57440 -; EXACT-NEXT: ds_read_b128 v[22:25], v1 offset:57424 -; EXACT-NEXT: ds_read_b128 v[18:21], v1 offset:57408 -; EXACT-NEXT: ds_read_b128 v[2:5], v1 offset:57344 -; EXACT-NEXT: ds_read_b128 v[6:9], v1 offset:57360 -; EXACT-NEXT: ds_read_b128 v[10:13], v1 offset:57376 -; EXACT-NEXT: ds_read_b128 v[14:17], v1 offset:57392 -; EXACT-NEXT: s_waitcnt lgkmcnt(0) -; EXACT-NEXT: v_mfma_f32_32x32x1f32 v[2:33], v34, v35, v[2:33] -; EXACT-NEXT: s_nop 7 -; EXACT-NEXT: s_nop 7 -; EXACT-NEXT: s_nop 2 -; EXACT-NEXT: ds_write_b128 v0, v[26:29] offset:32864 -; EXACT-NEXT: ds_write_b128 v0, v[30:33] offset:32880 -; EXACT-NEXT: ds_write_b128 v0, v[18:21] offset:32832 -; EXACT-NEXT: ds_write_b128 v0, v[22:25] offset:32848 -; EXACT-NEXT: ds_write_b128 v0, v[10:13] offset:32800 -; EXACT-NEXT: ds_write_b128 v0, v[14:17] offset:32816 -; EXACT-NEXT: ds_write_b128 v0, v[2:5] offset:32768 -; EXACT-NEXT: ds_write_b128 v0, v[6:9] offset:32784 -; EXACT-NEXT: s_endpgm -entry: - %idx = call i32 @llvm.amdgcn.workitem.id.x() - %load.0.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %in, i32 %idx - %load.0 = load <32 x float>, <32 x float> addrspace(3)* %load.0.addr - %load.1.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.0.addr, i32 64 - %load.1 = load <32 x float>, <32 x float> addrspace(3)* %load.1.addr - %load.2.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.1.addr, i32 128 - %load.2 = load <32 x float>, <32 x float> addrspace(3)* %load.2.addr - %load.3.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.2.addr, i32 192 - %load.3 = load <32 x float>, <32 x float> addrspace(3)* %load.3.addr - %load.4.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.3.addr, i32 256 - %load.4 = load <32 x float>, <32 x float> addrspace(3)* %load.4.addr - %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.0, i32 0, i32 0, i32 0) - %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.1, i32 0, i32 0, i32 0) - %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.2, i32 0, i32 0, i32 0) - %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.3, i32 0, i32 0, i32 0) - %mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.4, i32 0, i32 0, i32 0) - %store.0.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 %idx - store <32 x float> %mai.0, <32 x float> addrspace(3)* %store.0.addr - %store.1.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 64 - store <32 x float> %mai.1, <32 x float> addrspace(3)* %store.1.addr - %store.2.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 128 - store <32 x float> %mai.2, <32 x float> addrspace(3)* %store.2.addr - %store.3.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 192 - store <32 x float> %mai.3, <32 x float> addrspace(3)* %store.3.addr - %store.4.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 256 - store <32 x float> %mai.4, <32 x float> addrspace(3)* %store.4.addr - ret void -} - -declare i32 @llvm.amdgcn.workitem.id.x() #2 -declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1 -declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1 - -attributes #0 = { nounwind "amdgpu-flat-workgroup-size"="1,256" } -attributes #1 = { nounwind } -attributes #2 = { nounwind readnone speculatable } diff --git a/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir b/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir +++ /dev/null @@ -1,292 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - 2>&1 | FileCheck -check-prefix=DEFAULT %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-igrouplp=1 2>&1 | FileCheck -check-prefix=PIPELINE %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-igrouplp=1 -amdgpu-igrouplp-exact-solver 2>&1 | FileCheck -check-prefix=EXACT %s - ---- -name: no_pipeline -tracksRegLiveness: true -body: | - bb.0: - liveins: $sgpr0, $vgpr10_vgpr11 - ; DEFAULT-LABEL: name: no_pipeline - ; DEFAULT: liveins: $sgpr0, $vgpr10_vgpr11 - ; DEFAULT-NEXT: {{ $}} - ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec - ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec - ; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec - ; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec - ; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec - ; PIPELINE-LABEL: name: no_pipeline - ; PIPELINE: liveins: $sgpr0, $vgpr10_vgpr11 - ; PIPELINE-NEXT: {{ $}} - ; PIPELINE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - ; PIPELINE-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec - ; PIPELINE-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec - ; PIPELINE-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec - ; PIPELINE-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec - ; PIPELINE-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec - ; EXACT-LABEL: name: no_pipeline - ; EXACT: liveins: $sgpr0, $vgpr10_vgpr11 - ; EXACT-NEXT: {{ $}} - ; EXACT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; EXACT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; EXACT-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; EXACT-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - ; EXACT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; EXACT-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec - ; EXACT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec - ; EXACT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec - ; EXACT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec - ; EXACT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec - $vgpr1 = V_MOV_B32_e32 1, implicit $exec - $vgpr0 = V_MOV_B32_e32 1, implicit $exec - $vgpr8 = V_MOV_B32_e32 0, implicit $exec - $vgpr9 = V_MOV_B32_e32 9, implicit $exec - $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec - GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - $vgpr2 = V_MOV_B32_e32 1, implicit $exec - $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec - $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec -... - - ---- -name: full_pipe -tracksRegLiveness: true -body: | - bb.0: - liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $sgpr0, $vgpr10_vgpr11 - ; DEFAULT-LABEL: name: full_pipe - ; DEFAULT: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11 - ; DEFAULT-NEXT: {{ $}} - ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec - ; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec - ; DEFAULT-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - ; DEFAULT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec - ; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec - ; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec - ; DEFAULT-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec - ; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec - ; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec - ; DEFAULT-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec { - ; DEFAULT-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec - ; DEFAULT-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - ; DEFAULT-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec - ; DEFAULT-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec - ; DEFAULT-NEXT: $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - ; DEFAULT-NEXT: } - ; DEFAULT-NEXT: DS_WRITE_B32 $vgpr3, killed $vgpr1, 0, 16, implicit $m0, implicit $exec - ; DEFAULT-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec { - ; DEFAULT-NEXT: $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec - ; DEFAULT-NEXT: $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec - ; DEFAULT-NEXT: } - ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec - ; DEFAULT-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr23, killed $vgpr3, 0, 16, implicit $m0, implicit $exec - ; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec - ; PIPELINE-LABEL: name: full_pipe - ; PIPELINE: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11 - ; PIPELINE-NEXT: {{ $}} - ; PIPELINE-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; PIPELINE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec - ; PIPELINE-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec - ; PIPELINE-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - ; PIPELINE-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec - ; PIPELINE-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec - ; PIPELINE-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec - ; PIPELINE-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec - ; PIPELINE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; PIPELINE-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; PIPELINE-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec - ; PIPELINE-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec - ; PIPELINE-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec - ; PIPELINE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec { - ; PIPELINE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec - ; PIPELINE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - ; PIPELINE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec - ; PIPELINE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec - ; PIPELINE-NEXT: $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - ; PIPELINE-NEXT: } - ; PIPELINE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec - ; PIPELINE-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec { - ; PIPELINE-NEXT: $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec - ; PIPELINE-NEXT: $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec - ; PIPELINE-NEXT: } - ; PIPELINE-NEXT: BUNDLE implicit $vgpr0, implicit killed $vgpr7, implicit $m0, implicit $exec, implicit killed $vgpr23, implicit $vgpr3 { - ; PIPELINE-NEXT: DS_WRITE_B32 $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec - ; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr23, $vgpr3, 0, 16, implicit $m0, implicit $exec - ; PIPELINE-NEXT: } - ; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec - ; PIPELINE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; EXACT-LABEL: name: full_pipe - ; EXACT: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11 - ; EXACT-NEXT: {{ $}} - ; EXACT-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; EXACT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; EXACT-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec - ; EXACT-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec - ; EXACT-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - ; EXACT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec - ; EXACT-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec - ; EXACT-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec - ; EXACT-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec - ; EXACT-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; EXACT-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec - ; EXACT-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec - ; EXACT-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec - ; EXACT-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec - ; EXACT-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; EXACT-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; EXACT-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec - ; EXACT-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec - ; EXACT-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec - ; EXACT-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec - ; EXACT-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec { - ; EXACT-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec - ; EXACT-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - ; EXACT-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec - ; EXACT-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec - ; EXACT-NEXT: $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - ; EXACT-NEXT: } - ; EXACT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; EXACT-NEXT: DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec - ; EXACT-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec { - ; EXACT-NEXT: $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec - ; EXACT-NEXT: $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec - ; EXACT-NEXT: } - ; EXACT-NEXT: BUNDLE implicit $vgpr0, implicit killed $vgpr7, implicit $m0, implicit $exec, implicit killed $vgpr23, implicit $vgpr3 { - ; EXACT-NEXT: DS_WRITE_B32 $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec - ; EXACT-NEXT: DS_WRITE_B32 killed $vgpr23, $vgpr3, 0, 16, implicit $m0, implicit $exec - ; EXACT-NEXT: } - ; EXACT-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec - ; EXACT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; EXACT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - ; EXACT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; EXACT-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec - ; EXACT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 1, implicit $exec - $vgpr2 = V_MOV_B32_e32 2, implicit $exec - $vgpr3 = V_MOV_B32_e32 3, implicit $exec - $vgpr4 = V_MOV_B32_e32 4, implicit $exec - $vgpr5 = V_MOV_B32_e32 5, implicit $exec - $vgpr30 = V_MOV_B32_e32 30, implicit $exec - $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec - $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec - $vgpr9 = V_MOV_B32_e32 1, implicit $exec - $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec - $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - $vgpr24 = V_MOV_B32_e32 1, implicit $exec - $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec - $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - $vgpr26 = V_MOV_B32_e32 1, implicit $exec - $vgpr27 = V_MOV_B32_e32 1, implicit $exec - $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec - $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec - $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec - $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec - $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec - $vgpr17 = V_MOV_B32_e32 1, implicit $exec - $vgpr18 = V_MOV_B32_e32 1, implicit $exec - $vgpr20 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec - DS_WRITE_B32 $vgpr0, $vgpr7, 0, 16, implicit $m0, implicit $exec - $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 $vgpr10, $vgpr11, $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec - DS_WRITE_B32 $vgpr23, $vgpr3, 0, 16, implicit $m0, implicit $exec - $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - DS_WRITE_B32 $vgpr9, $vgpr24, 0, 16, implicit $m0, implicit $exec -... - ---- -name: block_ends_in_bundle -tracksRegLiveness: true -body: | - bb.0: - liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3 - ; DEFAULT-LABEL: name: block_ends_in_bundle - ; DEFAULT: liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3 - ; DEFAULT-NEXT: {{ $}} - ; DEFAULT-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit killed $vgpr7, implicit $exec { - ; DEFAULT-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec - ; DEFAULT-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - ; DEFAULT-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec - ; DEFAULT-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec - ; DEFAULT-NEXT: $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec - ; DEFAULT-NEXT: } - ; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PIPELINE-LABEL: name: block_ends_in_bundle - ; PIPELINE: liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3 - ; PIPELINE-NEXT: {{ $}} - ; PIPELINE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit killed $vgpr7, implicit $exec { - ; PIPELINE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec - ; PIPELINE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - ; PIPELINE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec - ; PIPELINE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec - ; PIPELINE-NEXT: $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec - ; PIPELINE-NEXT: } - ; PIPELINE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; EXACT-LABEL: name: block_ends_in_bundle - ; EXACT: liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3 - ; EXACT-NEXT: {{ $}} - ; EXACT-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit killed $vgpr7, implicit $exec { - ; EXACT-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec - ; EXACT-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - ; EXACT-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec - ; EXACT-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec - ; EXACT-NEXT: $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec - ; EXACT-NEXT: } - ; EXACT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec { - $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec - $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec - $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec - $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec - } -... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -0,0 +1,158 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +define amdgpu_kernel void @test_iglp_opt() #0 { +; GCN-LABEL: test_iglp_opt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: ; iglp_opt mask(0x00000000) +; GCN-NEXT: s_endpgm +entry: + call void @llvm.amdgcn.iglp.opt(i32 0) #1 + ret void +} + +define amdgpu_kernel void @test_iglp_opt_mfma_gemm(<32 x float> addrspace(3)* noalias %in, <32 x float> addrspace(3)* noalias %out) #0 { +; GCN-LABEL: test_iglp_opt_mfma_gemm: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-NEXT: ; iglp_opt mask(0x00000000) +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64 +; GCN-NEXT: ds_read_b128 a[0:3], v1 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:8240 +; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:8224 +; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:8304 +; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:8208 +; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:8192 +; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:8288 +; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v1 +; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49264 +; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49248 +; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49232 +; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49216 +; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49200 +; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49184 +; GCN-NEXT: ds_read_b128 a[116:119], v4 offset:57456 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[0:3] +; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:8256 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:24688 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:24672 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:24656 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:24640 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:24624 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:24608 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:24592 +; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:24576 +; GCN-NEXT: ds_read_b128 a[112:115], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[108:111], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[104:107], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[88:91], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[92:95], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[96:99], v4 offset:57376 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304 +; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:49168 +; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:49152 +; GCN-NEXT: ds_read_b128 a[100:103], v4 offset:57392 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[88:119], v2, v3, a[88:119] +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:8208 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[56:87], v2, v3, a[56:87] +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:32784 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24592 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16400 +; GCN-NEXT: s_endpgm +entry: + call void @llvm.amdgcn.iglp.opt(i32 0) + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.0.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %in, i32 %idx + %load.0 = load <32 x float>, <32 x float> addrspace(3)* %load.0.addr + %load.1.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.0.addr, i32 64 + %load.1 = load <32 x float>, <32 x float> addrspace(3)* %load.1.addr + %load.2.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.1.addr, i32 128 + %load.2 = load <32 x float>, <32 x float> addrspace(3)* %load.2.addr + %load.3.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.2.addr, i32 192 + %load.3 = load <32 x float>, <32 x float> addrspace(3)* %load.3.addr + %load.4.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.3.addr, i32 256 + %load.4 = load <32 x float>, <32 x float> addrspace(3)* %load.4.addr + %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.0, i32 0, i32 0, i32 0) + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.2, i32 0, i32 0, i32 0) + %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.3, i32 0, i32 0, i32 0) + %mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.4, i32 0, i32 0, i32 0) + %store.0.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 %idx + store <32 x float> %mai.0, <32 x float> addrspace(3)* %store.0.addr + %store.1.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 64 + store <32 x float> %mai.1, <32 x float> addrspace(3)* %store.1.addr + %store.2.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 128 + store <32 x float> %mai.2, <32 x float> addrspace(3)* %store.2.addr + %store.3.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 192 + store <32 x float> %mai.3, <32 x float> addrspace(3)* %store.3.addr + %store.4.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 256 + store <32 x float> %mai.4, <32 x float> addrspace(3)* %store.4.addr + ret void +} + +declare void @llvm.amdgcn.iglp.opt(i32) #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1 + +attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" } +attributes #1 = { convergent nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -31,6 +31,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -50,32 +51,37 @@ ; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v31, v31, v31 -; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 -; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 -; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 +; GCN-NEXT: s_waitcnt vmcnt(5) ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 +; GCN-NEXT: s_waitcnt vmcnt(4) ; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 ; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 ; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 ; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 +; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_mul_lo_u32 v19, v19, v19 ; GCN-NEXT: v_mul_lo_u32 v18, v18, v18 ; GCN-NEXT: v_mul_lo_u32 v17, v17, v17 ; GCN-NEXT: v_mul_lo_u32 v16, v16, v16 +; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_mul_lo_u32 v23, v23, v23 ; GCN-NEXT: v_mul_lo_u32 v22, v22, v22 ; GCN-NEXT: v_mul_lo_u32 v21, v21, v21 ; GCN-NEXT: v_mul_lo_u32 v20, v20, v20 +; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v27, v27, v27 ; GCN-NEXT: v_mul_lo_u32 v26, v26, v26 ; GCN-NEXT: v_mul_lo_u32 v25, v25, v25 ; GCN-NEXT: v_mul_lo_u32 v24, v24, v24 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v31, v31, v31 +; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 +; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 +; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 ; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 ; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 ; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 @@ -92,6 +98,7 @@ ; EXACTCUTOFF: ; %bb.0: ; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -111,32 +118,37 @@ ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v31, v31, v31 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(5) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(4) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(3) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v19, v19, v19 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v18, v18, v18 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v17, v17, v17 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v16, v16, v16 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(2) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v23, v23, v23 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v22, v22, v22 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v21, v21, v21 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v20, v20, v20 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v27, v27, v27 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v26, v26, v26 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v25, v25, v25 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v24, v24, v24 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v31, v31, v31 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 @@ -168,13 +180,19 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 ; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 +; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 +; GCN-NEXT: v_mul_lo_u32 v31, v31, v31 +; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -212,27 +230,18 @@ ; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_mul_lo_u32 v19, v19, v19 ; GCN-NEXT: v_mul_lo_u32 v18, v18, v18 +; GCN-NEXT: v_mul_lo_u32 v17, v17, v17 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v23, v23, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v25, v25, v25 -; GCN-NEXT: v_mul_lo_u32 v24, v24, v24 -; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 ; GCN-NEXT: v_mul_lo_u32 v27, v27, v27 ; GCN-NEXT: v_mul_lo_u32 v26, v26, v26 +; GCN-NEXT: v_mul_lo_u32 v25, v25, v25 +; GCN-NEXT: v_mul_lo_u32 v24, v24, v24 ; GCN-NEXT: v_mul_lo_u32 v22, v22, v22 ; GCN-NEXT: v_mul_lo_u32 v21, v21, v21 ; GCN-NEXT: v_mul_lo_u32 v20, v20, v20 -; GCN-NEXT: v_mul_lo_u32 v17, v17, v17 ; GCN-NEXT: v_mul_lo_u32 v16, v16, v16 -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 -; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 -; GCN-NEXT: v_mul_lo_u32 v31, v31, v31 -; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 ; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 ; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 ; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:80 @@ -241,6 +250,9 @@ ; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:32 ; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:16 ; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(8) SyncID(0) ; GCN-NEXT: s_endpgm ; @@ -248,13 +260,19 @@ ; EXACTCUTOFF: ; %bb.0: ; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v31, v31, v31 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) @@ -292,27 +310,18 @@ ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(2) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v19, v19, v19 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v18, v18, v18 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v17, v17, v17 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v23, v23, v23 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v25, v25, v25 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v24, v24, v24 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v27, v27, v27 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v26, v26, v26 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v25, v25, v25 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v24, v24, v24 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v22, v22, v22 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v21, v21, v21 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v20, v20, v20 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v17, v17, v17 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v16, v16, v16 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v31, v31, v31 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:80 @@ -321,6 +330,9 @@ ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:32 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:16 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #2 @@ -371,9 +383,21 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 7, v0 -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 ; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 +; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 +; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 +; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 @@ -388,20 +412,29 @@ ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 ; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112 ; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96 -; GCN-NEXT: s_nop 0 -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 -; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 +; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 +; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 +; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 +; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 +; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 +; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 ; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 -; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 -; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) @@ -413,6 +446,7 @@ ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 @@ -420,35 +454,13 @@ ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 ; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80 -; GCN-NEXT: s_nop 0 -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 -; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 -; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 -; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 -; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 +; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 -; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 -; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 -; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 -; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -459,9 +471,21 @@ ; EXACTCUTOFF: ; %bb.0: ; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 7, v0 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 @@ -476,20 +500,29 @@ ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112 ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96 -; EXACTCUTOFF-NEXT: s_nop 0 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) @@ -501,6 +534,7 @@ ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 @@ -508,35 +542,13 @@ ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80 -; EXACTCUTOFF-NEXT: s_nop 0 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(2) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -603,112 +615,105 @@ ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v99, 7, v0 -; GCN-NEXT: v_mov_b32_e32 v96, 1.0 -; GCN-NEXT: v_mov_b32_e32 v97, 2.0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v98, s0, v99 -; GCN-NEXT: ds_read_b128 v[28:31], v98 offset:112 -; GCN-NEXT: ds_read_b128 v[24:27], v98 offset:96 -; GCN-NEXT: ds_read_b128 v[20:23], v98 offset:80 -; GCN-NEXT: ds_read_b128 v[16:19], v98 offset:64 -; GCN-NEXT: ds_read_b128 v[0:3], v98 -; GCN-NEXT: ds_read_b128 v[4:7], v98 offset:16 -; GCN-NEXT: ds_read_b128 v[8:11], v98 offset:32 -; GCN-NEXT: ds_read_b128 v[12:15], v98 offset:48 -; GCN-NEXT: ds_read_b128 v[60:63], v98 offset:8304 -; GCN-NEXT: ds_read_b128 v[56:59], v98 offset:8288 -; GCN-NEXT: ds_read_b128 v[52:55], v98 offset:8272 -; GCN-NEXT: ds_read_b128 v[48:51], v98 offset:8256 -; GCN-NEXT: ds_read_b128 v[44:47], v98 offset:8240 -; GCN-NEXT: ds_read_b128 v[40:43], v98 offset:8224 -; GCN-NEXT: ds_read_b128 v[36:39], v98 offset:8208 -; GCN-NEXT: ds_read_b128 v[32:35], v98 offset:8192 -; GCN-NEXT: ds_read_b128 v[92:95], v98 offset:24688 -; GCN-NEXT: ds_read_b128 v[88:91], v98 offset:24672 -; GCN-NEXT: ds_read_b128 v[84:87], v98 offset:24656 -; GCN-NEXT: ds_read_b128 v[80:83], v98 offset:24640 -; GCN-NEXT: ds_read_b128 v[76:79], v98 offset:24624 -; GCN-NEXT: ds_read_b128 v[72:75], v98 offset:24608 -; GCN-NEXT: ds_read_b128 v[68:71], v98 offset:24592 -; GCN-NEXT: ds_read_b128 v[64:67], v98 offset:24576 -; GCN-NEXT: v_add_u32_e32 v99, s1, v99 -; GCN-NEXT: s_waitcnt lgkmcnt(14) -; GCN-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v96, v97, v[0:31] -; GCN-NEXT: v_add_u32_e32 v100, 0x6000, v98 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: ds_write_b128 v99, v[28:31] offset:112 -; GCN-NEXT: ds_write_b128 v99, v[24:27] offset:96 -; GCN-NEXT: ds_write_b128 v99, v[20:23] offset:80 -; GCN-NEXT: ds_write_b128 v99, v[16:19] offset:64 -; GCN-NEXT: ds_write_b128 v99, v[12:15] offset:48 -; GCN-NEXT: ds_write_b128 v99, v[8:11] offset:32 -; GCN-NEXT: ds_write_b128 v99, v[4:7] offset:16 -; GCN-NEXT: ds_write_b128 v99, v[0:3] -; GCN-NEXT: ds_read_b128 v[28:31], v98 offset:49264 -; GCN-NEXT: ds_read_b128 v[24:27], v98 offset:49248 -; GCN-NEXT: ds_read_b128 v[20:23], v98 offset:49232 -; GCN-NEXT: ds_read_b128 v[16:19], v98 offset:49216 -; GCN-NEXT: ds_read_b128 v[12:15], v98 offset:49200 -; GCN-NEXT: ds_read_b128 v[8:11], v98 offset:49184 -; GCN-NEXT: ds_read_b128 v[4:7], v98 offset:49168 -; GCN-NEXT: ds_read_b128 v[0:3], v98 offset:49152 -; GCN-NEXT: s_waitcnt lgkmcnt(14) -; GCN-NEXT: v_mfma_f32_32x32x1f32 v[32:63], v96, v97, v[32:63] -; GCN-NEXT: v_mov_b32_e32 v98, s1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: ds_write_b128 v98, v[56:59] offset:8288 -; GCN-NEXT: ds_write_b128 v98, v[60:63] offset:8304 -; GCN-NEXT: ds_write_b128 v98, v[48:51] offset:8256 -; GCN-NEXT: ds_write_b128 v98, v[52:55] offset:8272 -; GCN-NEXT: ds_write_b128 v98, v[40:43] offset:8224 -; GCN-NEXT: ds_write_b128 v98, v[44:47] offset:8240 -; GCN-NEXT: ds_write_b128 v98, v[32:35] offset:8192 -; GCN-NEXT: ds_write_b128 v98, v[36:39] offset:8208 -; GCN-NEXT: ds_read_b128 v[60:63], v100 offset:57456 -; GCN-NEXT: ds_read_b128 v[56:59], v100 offset:57440 -; GCN-NEXT: ds_read_b128 v[52:55], v100 offset:57424 -; GCN-NEXT: ds_read_b128 v[48:51], v100 offset:57408 -; GCN-NEXT: ds_read_b128 v[32:35], v100 offset:57344 -; GCN-NEXT: ds_read_b128 v[36:39], v100 offset:57360 -; GCN-NEXT: ds_read_b128 v[40:43], v100 offset:57376 -; GCN-NEXT: ds_read_b128 v[44:47], v100 offset:57392 -; GCN-NEXT: v_mfma_f32_32x32x1f32 v[64:95], v96, v97, v[64:95] +; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64 +; GCN-NEXT: ds_read_b128 a[0:3], v1 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 +; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:8304 +; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:8288 +; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:8272 +; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:8256 +; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:8240 +; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:8224 +; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:8208 +; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:8192 +; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 +; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688 +; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672 +; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656 +; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640 +; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624 +; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608 +; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 +; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 +; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:49264 +; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:49248 +; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:49232 +; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:49216 +; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:49200 +; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:49184 +; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:49168 +; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:49152 +; GCN-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-NEXT: ds_read_b128 a[156:159], v2 offset:57456 +; GCN-NEXT: ds_read_b128 a[152:155], v2 offset:57440 +; GCN-NEXT: ds_read_b128 a[148:151], v2 offset:57424 +; GCN-NEXT: ds_read_b128 a[144:147], v2 offset:57408 +; GCN-NEXT: ds_read_b128 a[128:131], v2 offset:57344 +; GCN-NEXT: ds_read_b128 a[132:135], v2 offset:57360 +; GCN-NEXT: ds_read_b128 a[136:139], v2 offset:57376 +; GCN-NEXT: ds_read_b128 a[140:143], v2 offset:57392 +; GCN-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) ; GCN-NEXT: s_waitcnt lgkmcnt(14) -; GCN-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v96, v97, v[0:31] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] +; GCN-NEXT: s_waitcnt lgkmcnt(8) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 v[32:63], v96, v97, v[32:63] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 6 -; GCN-NEXT: ds_write_b128 v98, v[88:91] offset:16480 -; GCN-NEXT: ds_write_b128 v98, v[92:95] offset:16496 -; GCN-NEXT: ds_write_b128 v98, v[80:83] offset:16448 -; GCN-NEXT: ds_write_b128 v98, v[84:87] offset:16464 -; GCN-NEXT: ds_write_b128 v98, v[72:75] offset:16416 -; GCN-NEXT: ds_write_b128 v98, v[76:79] offset:16432 -; GCN-NEXT: ds_write_b128 v98, v[64:67] offset:16384 -; GCN-NEXT: ds_write_b128 v98, v[68:71] offset:16400 -; GCN-NEXT: ds_write_b128 v98, v[24:27] offset:24672 -; GCN-NEXT: ds_write_b128 v98, v[28:31] offset:24688 -; GCN-NEXT: ds_write_b128 v98, v[16:19] offset:24640 -; GCN-NEXT: ds_write_b128 v98, v[20:23] offset:24656 -; GCN-NEXT: ds_write_b128 v98, v[8:11] offset:24608 -; GCN-NEXT: ds_write_b128 v98, v[12:15] offset:24624 -; GCN-NEXT: ds_write_b128 v98, v[0:3] offset:24576 -; GCN-NEXT: ds_write_b128 v98, v[4:7] offset:24592 -; GCN-NEXT: ds_write_b128 v98, v[56:59] offset:32864 -; GCN-NEXT: ds_write_b128 v98, v[60:63] offset:32880 -; GCN-NEXT: ds_write_b128 v98, v[48:51] offset:32832 -; GCN-NEXT: ds_write_b128 v98, v[52:55] offset:32848 -; GCN-NEXT: ds_write_b128 v98, v[40:43] offset:32800 -; GCN-NEXT: ds_write_b128 v98, v[44:47] offset:32816 -; GCN-NEXT: ds_write_b128 v98, v[32:35] offset:32768 -; GCN-NEXT: ds_write_b128 v98, v[36:39] offset:32784 +; GCN-NEXT: s_nop 4 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; GCN-NEXT: ds_write_b128 v0, a[0:3] +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:24592 +; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:32784 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(40) SyncID(0) ; GCN-NEXT: s_endpgm @@ -716,112 +721,105 @@ ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v99, 7, v0 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v96, 1.0 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v97, 2.0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v98, s0, v99 -; EXACTCUTOFF-NEXT: ds_read_b128 v[28:31], v98 offset:112 -; EXACTCUTOFF-NEXT: ds_read_b128 v[24:27], v98 offset:96 -; EXACTCUTOFF-NEXT: ds_read_b128 v[20:23], v98 offset:80 -; EXACTCUTOFF-NEXT: ds_read_b128 v[16:19], v98 offset:64 -; EXACTCUTOFF-NEXT: ds_read_b128 v[0:3], v98 -; EXACTCUTOFF-NEXT: ds_read_b128 v[4:7], v98 offset:16 -; EXACTCUTOFF-NEXT: ds_read_b128 v[8:11], v98 offset:32 -; EXACTCUTOFF-NEXT: ds_read_b128 v[12:15], v98 offset:48 -; EXACTCUTOFF-NEXT: ds_read_b128 v[60:63], v98 offset:8304 -; EXACTCUTOFF-NEXT: ds_read_b128 v[56:59], v98 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 v[52:55], v98 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 v[48:51], v98 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 v[44:47], v98 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 v[40:43], v98 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 v[36:39], v98 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 v[32:35], v98 offset:8192 -; EXACTCUTOFF-NEXT: ds_read_b128 v[92:95], v98 offset:24688 -; EXACTCUTOFF-NEXT: ds_read_b128 v[88:91], v98 offset:24672 -; EXACTCUTOFF-NEXT: ds_read_b128 v[84:87], v98 offset:24656 -; EXACTCUTOFF-NEXT: ds_read_b128 v[80:83], v98 offset:24640 -; EXACTCUTOFF-NEXT: ds_read_b128 v[76:79], v98 offset:24624 -; EXACTCUTOFF-NEXT: ds_read_b128 v[72:75], v98 offset:24608 -; EXACTCUTOFF-NEXT: ds_read_b128 v[68:71], v98 offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 v[64:67], v98 offset:24576 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v99, s1, v99 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v96, v97, v[0:31] -; EXACTCUTOFF-NEXT: v_add_u32_e32 v100, 0x6000, v98 -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 1 -; EXACTCUTOFF-NEXT: ds_write_b128 v99, v[28:31] offset:112 -; EXACTCUTOFF-NEXT: ds_write_b128 v99, v[24:27] offset:96 -; EXACTCUTOFF-NEXT: ds_write_b128 v99, v[20:23] offset:80 -; EXACTCUTOFF-NEXT: ds_write_b128 v99, v[16:19] offset:64 -; EXACTCUTOFF-NEXT: ds_write_b128 v99, v[12:15] offset:48 -; EXACTCUTOFF-NEXT: ds_write_b128 v99, v[8:11] offset:32 -; EXACTCUTOFF-NEXT: ds_write_b128 v99, v[4:7] offset:16 -; EXACTCUTOFF-NEXT: ds_write_b128 v99, v[0:3] -; EXACTCUTOFF-NEXT: ds_read_b128 v[28:31], v98 offset:49264 -; EXACTCUTOFF-NEXT: ds_read_b128 v[24:27], v98 offset:49248 -; EXACTCUTOFF-NEXT: ds_read_b128 v[20:23], v98 offset:49232 -; EXACTCUTOFF-NEXT: ds_read_b128 v[16:19], v98 offset:49216 -; EXACTCUTOFF-NEXT: ds_read_b128 v[12:15], v98 offset:49200 -; EXACTCUTOFF-NEXT: ds_read_b128 v[8:11], v98 offset:49184 -; EXACTCUTOFF-NEXT: ds_read_b128 v[4:7], v98 offset:49168 -; EXACTCUTOFF-NEXT: ds_read_b128 v[0:3], v98 offset:49152 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 v[32:63], v96, v97, v[32:63] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v98, s1 -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 1 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[56:59] offset:8288 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[60:63] offset:8304 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[48:51] offset:8256 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[52:55] offset:8272 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[40:43] offset:8224 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[44:47] offset:8240 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[32:35] offset:8192 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[36:39] offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 v[60:63], v100 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 v[56:59], v100 offset:57440 -; EXACTCUTOFF-NEXT: ds_read_b128 v[52:55], v100 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 v[48:51], v100 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 v[32:35], v100 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 v[36:39], v100 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 v[40:43], v100 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 v[44:47], v100 offset:57392 -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 v[64:95], v96, v97, v[64:95] +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:48 +; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v1 offset:8304 +; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v1 offset:8288 +; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v1 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v1 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v1 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v1 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v1 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v1 offset:8192 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v2, 0x6000, v1 +; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:24688 +; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:24672 +; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:24656 +; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:24640 +; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:24624 +; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:24608 +; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:24576 +; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:49264 +; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:49248 +; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:49232 +; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:49216 +; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:49200 +; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:49184 +; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:49168 +; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:49152 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 1.0 +; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v2 offset:57456 +; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v2 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v2 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v2 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v2 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v2 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v2 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v2 offset:57392 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 2.0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v96, v97, v[0:31] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127] ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 v[32:63], v96, v97, v[32:63] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159] ; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 6 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[88:91] offset:16480 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[92:95] offset:16496 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[80:83] offset:16448 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[84:87] offset:16464 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[72:75] offset:16416 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[76:79] offset:16432 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[64:67] offset:16384 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[68:71] offset:16400 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[24:27] offset:24672 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[28:31] offset:24688 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[16:19] offset:24640 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[20:23] offset:24656 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[8:11] offset:24608 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[12:15] offset:24624 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[0:3] offset:24576 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[4:7] offset:24592 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[56:59] offset:32864 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[60:63] offset:32880 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[48:51] offset:32832 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[52:55] offset:32848 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[40:43] offset:32800 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[44:47] offset:32816 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[32:35] offset:32768 -; EXACTCUTOFF-NEXT: ds_write_b128 v98, v[36:39] offset:32784 +; EXACTCUTOFF-NEXT: s_nop 4 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:8288 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:8304 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:8256 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:8272 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:8224 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:8240 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:8192 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:8208 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:16480 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:16496 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:16448 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[84:87] offset:16464 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[72:75] offset:16416 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:16432 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:16384 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:16400 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:24672 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:24688 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:24640 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:24656 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:24608 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:24624 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:24576 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:24592 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:32864 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:32880 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:32832 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:32848 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:32800 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:32816 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] offset:32768 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:32784 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(40) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm @@ -865,266 +863,266 @@ ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 7, v0 -; GCN-NEXT: v_mov_b32_e32 v34, 1.0 -; GCN-NEXT: v_mov_b32_e32 v35, 2.0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v32, s0, v33 -; GCN-NEXT: ds_read_b128 v[28:31], v32 offset:112 -; GCN-NEXT: ds_read_b128 v[24:27], v32 offset:96 -; GCN-NEXT: ds_read_b128 v[20:23], v32 offset:80 -; GCN-NEXT: ds_read_b128 v[16:19], v32 offset:64 -; GCN-NEXT: ds_read_b128 v[0:3], v32 -; GCN-NEXT: ds_read_b128 v[4:7], v32 offset:16 -; GCN-NEXT: ds_read_b128 v[8:11], v32 offset:32 -; GCN-NEXT: ds_read_b128 v[12:15], v32 offset:48 -; GCN-NEXT: v_add_u32_e32 v33, s1, v33 -; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64 +; GCN-NEXT: ds_read_b128 a[0:3], v1 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 -; GCN-NEXT: ds_write_b128 v33, v[28:31] offset:112 -; GCN-NEXT: ds_write_b128 v33, v[24:27] offset:96 -; GCN-NEXT: ds_write_b128 v33, v[20:23] offset:80 -; GCN-NEXT: ds_write_b128 v33, v[16:19] offset:64 -; GCN-NEXT: ds_write_b128 v33, v[12:15] offset:48 -; GCN-NEXT: ds_write_b128 v33, v[8:11] offset:32 -; GCN-NEXT: ds_write_b128 v33, v[4:7] offset:16 -; GCN-NEXT: ds_write_b128 v33, v[0:3] -; GCN-NEXT: ds_read_b128 v[64:67], v32 offset:8304 -; GCN-NEXT: ds_read_b128 v[60:63], v32 offset:8288 -; GCN-NEXT: ds_read_b128 v[56:59], v32 offset:8272 -; GCN-NEXT: ds_read_b128 v[52:55], v32 offset:8256 -; GCN-NEXT: ds_read_b128 v[48:51], v32 offset:8240 -; GCN-NEXT: ds_read_b128 v[44:47], v32 offset:8224 -; GCN-NEXT: ds_read_b128 v[40:43], v32 offset:8208 -; GCN-NEXT: ds_read_b128 v[36:39], v32 offset:8192 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; GCN-NEXT: ds_write_b128 v0, a[0:3] +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208 +; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v32 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67] ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 -; GCN-NEXT: ds_write_b128 v0, v[60:63] offset:8288 -; GCN-NEXT: ds_write_b128 v0, v[64:67] offset:8304 -; GCN-NEXT: ds_write_b128 v0, v[52:55] offset:8256 -; GCN-NEXT: ds_write_b128 v0, v[56:59] offset:8272 -; GCN-NEXT: ds_write_b128 v0, v[44:47] offset:8224 -; GCN-NEXT: ds_write_b128 v0, v[48:51] offset:8240 -; GCN-NEXT: ds_write_b128 v0, v[36:39] offset:8192 -; GCN-NEXT: ds_write_b128 v0, v[40:43] offset:8208 -; GCN-NEXT: ds_read_b128 v[64:67], v32 offset:24688 -; GCN-NEXT: ds_read_b128 v[60:63], v32 offset:24672 -; GCN-NEXT: ds_read_b128 v[56:59], v32 offset:24656 -; GCN-NEXT: ds_read_b128 v[52:55], v32 offset:24640 -; GCN-NEXT: ds_read_b128 v[48:51], v32 offset:24624 -; GCN-NEXT: ds_read_b128 v[44:47], v32 offset:24608 -; GCN-NEXT: ds_read_b128 v[40:43], v32 offset:24592 -; GCN-NEXT: ds_read_b128 v[36:39], v32 offset:24576 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208 +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:24688 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:24672 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:24656 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:24640 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:24624 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:24608 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:24592 +; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:24576 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67] ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: ds_write_b128 v0, v[60:63] offset:16480 -; GCN-NEXT: ds_write_b128 v0, v[64:67] offset:16496 -; GCN-NEXT: ds_write_b128 v0, v[52:55] offset:16448 -; GCN-NEXT: ds_write_b128 v0, v[56:59] offset:16464 -; GCN-NEXT: ds_write_b128 v0, v[44:47] offset:16416 -; GCN-NEXT: ds_write_b128 v0, v[48:51] offset:16432 -; GCN-NEXT: ds_write_b128 v0, v[36:39] offset:16384 -; GCN-NEXT: ds_write_b128 v0, v[40:43] offset:16400 -; GCN-NEXT: ds_read_b128 v[64:67], v32 offset:49264 -; GCN-NEXT: ds_read_b128 v[60:63], v32 offset:49248 -; GCN-NEXT: ds_read_b128 v[56:59], v32 offset:49232 -; GCN-NEXT: ds_read_b128 v[52:55], v32 offset:49216 -; GCN-NEXT: ds_read_b128 v[48:51], v32 offset:49200 -; GCN-NEXT: ds_read_b128 v[44:47], v32 offset:49184 -; GCN-NEXT: ds_read_b128 v[40:43], v32 offset:49168 -; GCN-NEXT: ds_read_b128 v[36:39], v32 offset:49152 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16400 +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:49264 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:49248 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:49232 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:49216 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:49200 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:49184 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:49168 +; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:49152 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67] ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 -; GCN-NEXT: ds_write_b128 v0, v[60:63] offset:24672 -; GCN-NEXT: ds_write_b128 v0, v[64:67] offset:24688 -; GCN-NEXT: ds_write_b128 v0, v[52:55] offset:24640 -; GCN-NEXT: ds_write_b128 v0, v[56:59] offset:24656 -; GCN-NEXT: ds_write_b128 v0, v[44:47] offset:24608 -; GCN-NEXT: ds_write_b128 v0, v[48:51] offset:24624 -; GCN-NEXT: ds_write_b128 v0, v[36:39] offset:24576 -; GCN-NEXT: ds_write_b128 v0, v[40:43] offset:24592 -; GCN-NEXT: ds_read_b128 v[30:33], v1 offset:57456 -; GCN-NEXT: ds_read_b128 v[26:29], v1 offset:57440 -; GCN-NEXT: ds_read_b128 v[22:25], v1 offset:57424 -; GCN-NEXT: ds_read_b128 v[18:21], v1 offset:57408 -; GCN-NEXT: ds_read_b128 v[2:5], v1 offset:57344 -; GCN-NEXT: ds_read_b128 v[6:9], v1 offset:57360 -; GCN-NEXT: ds_read_b128 v[10:13], v1 offset:57376 -; GCN-NEXT: ds_read_b128 v[14:17], v1 offset:57392 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:24592 +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:57456 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:57440 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:57424 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:57408 +; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:57344 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:57360 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:57376 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:57392 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 v[2:33], v34, v35, v[2:33] ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: ds_write_b128 v0, v[26:29] offset:32864 -; GCN-NEXT: ds_write_b128 v0, v[30:33] offset:32880 -; GCN-NEXT: ds_write_b128 v0, v[18:21] offset:32832 -; GCN-NEXT: ds_write_b128 v0, v[22:25] offset:32848 -; GCN-NEXT: ds_write_b128 v0, v[10:13] offset:32800 -; GCN-NEXT: ds_write_b128 v0, v[14:17] offset:32816 -; GCN-NEXT: ds_write_b128 v0, v[2:5] offset:32768 -; GCN-NEXT: ds_write_b128 v0, v[6:9] offset:32784 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v33, 7, v0 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v34, 1.0 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v35, 2.0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v32, s0, v33 -; EXACTCUTOFF-NEXT: ds_read_b128 v[28:31], v32 offset:112 -; EXACTCUTOFF-NEXT: ds_read_b128 v[24:27], v32 offset:96 -; EXACTCUTOFF-NEXT: ds_read_b128 v[20:23], v32 offset:80 -; EXACTCUTOFF-NEXT: ds_read_b128 v[16:19], v32 offset:64 -; EXACTCUTOFF-NEXT: ds_read_b128 v[0:3], v32 -; EXACTCUTOFF-NEXT: ds_read_b128 v[4:7], v32 offset:16 -; EXACTCUTOFF-NEXT: ds_read_b128 v[8:11], v32 offset:32 -; EXACTCUTOFF-NEXT: ds_read_b128 v[12:15], v32 offset:48 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v33, s1, v33 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:48 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 2 -; EXACTCUTOFF-NEXT: ds_write_b128 v33, v[28:31] offset:112 -; EXACTCUTOFF-NEXT: ds_write_b128 v33, v[24:27] offset:96 -; EXACTCUTOFF-NEXT: ds_write_b128 v33, v[20:23] offset:80 -; EXACTCUTOFF-NEXT: ds_write_b128 v33, v[16:19] offset:64 -; EXACTCUTOFF-NEXT: ds_write_b128 v33, v[12:15] offset:48 -; EXACTCUTOFF-NEXT: ds_write_b128 v33, v[8:11] offset:32 -; EXACTCUTOFF-NEXT: ds_write_b128 v33, v[4:7] offset:16 -; EXACTCUTOFF-NEXT: ds_write_b128 v33, v[0:3] -; EXACTCUTOFF-NEXT: ds_read_b128 v[64:67], v32 offset:8304 -; EXACTCUTOFF-NEXT: ds_read_b128 v[60:63], v32 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 v[56:59], v32 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 v[52:55], v32 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 v[48:51], v32 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 v[44:47], v32 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 v[40:43], v32 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 v[36:39], v32 offset:8192 +; EXACTCUTOFF-NEXT: s_nop 1 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, 0x6000, v32 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 2 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[60:63] offset:8288 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[64:67] offset:8304 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[52:55] offset:8256 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[56:59] offset:8272 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[44:47] offset:8224 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[48:51] offset:8240 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[36:39] offset:8192 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[40:43] offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 v[64:67], v32 offset:24688 -; EXACTCUTOFF-NEXT: ds_read_b128 v[60:63], v32 offset:24672 -; EXACTCUTOFF-NEXT: ds_read_b128 v[56:59], v32 offset:24656 -; EXACTCUTOFF-NEXT: ds_read_b128 v[52:55], v32 offset:24640 -; EXACTCUTOFF-NEXT: ds_read_b128 v[48:51], v32 offset:24624 -; EXACTCUTOFF-NEXT: ds_read_b128 v[44:47], v32 offset:24608 -; EXACTCUTOFF-NEXT: ds_read_b128 v[40:43], v32 offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 v[36:39], v32 offset:24576 +; EXACTCUTOFF-NEXT: s_nop 1 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:8272 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:8224 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:24688 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:24672 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:24656 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:24640 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:24624 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:24608 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:24576 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 2 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[60:63] offset:16480 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[64:67] offset:16496 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[52:55] offset:16448 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[56:59] offset:16464 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[44:47] offset:16416 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[48:51] offset:16432 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[36:39] offset:16384 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[40:43] offset:16400 -; EXACTCUTOFF-NEXT: ds_read_b128 v[64:67], v32 offset:49264 -; EXACTCUTOFF-NEXT: ds_read_b128 v[60:63], v32 offset:49248 -; EXACTCUTOFF-NEXT: ds_read_b128 v[56:59], v32 offset:49232 -; EXACTCUTOFF-NEXT: ds_read_b128 v[52:55], v32 offset:49216 -; EXACTCUTOFF-NEXT: ds_read_b128 v[48:51], v32 offset:49200 -; EXACTCUTOFF-NEXT: ds_read_b128 v[44:47], v32 offset:49184 -; EXACTCUTOFF-NEXT: ds_read_b128 v[40:43], v32 offset:49168 -; EXACTCUTOFF-NEXT: ds_read_b128 v[36:39], v32 offset:49152 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:16480 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:16496 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:16448 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:16464 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:16416 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:16432 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:16384 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16400 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:49264 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:49248 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:49232 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:49216 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:49200 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:49184 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:49168 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:49152 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, 0x6000, v1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 2 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[60:63] offset:24672 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[64:67] offset:24688 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[52:55] offset:24640 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[56:59] offset:24656 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[44:47] offset:24608 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[48:51] offset:24624 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[36:39] offset:24576 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[40:43] offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 v[30:33], v1 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 v[26:29], v1 offset:57440 -; EXACTCUTOFF-NEXT: ds_read_b128 v[22:25], v1 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 v[18:21], v1 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 v[2:5], v1 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 v[6:9], v1 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 v[10:13], v1 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 v[14:17], v1 offset:57392 +; EXACTCUTOFF-NEXT: s_nop 1 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:24672 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:24688 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:24640 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:24656 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:24608 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:24624 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:24576 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:57456 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:57392 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 v[2:33], v34, v35, v[2:33] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 2 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[26:29] offset:32864 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[30:33] offset:32880 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[18:21] offset:32832 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[22:25] offset:32848 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[10:13] offset:32800 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[14:17] offset:32816 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[2:5] offset:32768 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, v[6:9] offset:32784 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:32784 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm entry: @@ -1193,6 +1191,6 @@ declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1 -attributes #0 = { nounwind "amdgpu-flat-workgroup-size"="1,256" } +attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" } attributes #1 = { nounwind } attributes #2 = { nounwind readnone speculatable } diff --git a/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir b/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir --- a/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir @@ -211,10 +211,10 @@ ; GREEDY: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; GREEDY-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; GREEDY-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; GREEDY-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec + ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; GREEDY-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; GREEDY-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec @@ -299,10 +299,10 @@ ; GREEDY-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GREEDY-NEXT: S_NOP 0 ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; GREEDY-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; GREEDY-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec + ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; GREEDY-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; GREEDY-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec