diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -229,6 +229,9 @@
 void initializeSIModeRegisterPass(PassRegistry&);
 extern char &SIModeRegisterID;
 
+void initializeSIInsertHardClausesPass(PassRegistry &);
+extern char &SIInsertHardClausesID;
+
 void initializeSIInsertWaitcntsPass(PassRegistry&);
 extern char &SIInsertWaitcntsID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -241,6 +241,7 @@
   initializeAMDGPURewriteOutArgumentsPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
+  initializeSIInsertHardClausesPass(*PR);
   initializeSIInsertWaitcntsPass(*PR);
   initializeSIModeRegisterPass(*PR);
   initializeSIWholeQuadModePass(*PR);
@@ -1044,6 +1045,7 @@
   // FIXME: This stand-alone pass will emit indiv. S_NOP 0, as needed. It would
   // be better for it to emit S_NOP when possible.
   addPass(&PostRAHazardRecognizerID);
+  addPass(&SIInsertHardClausesID);
   addPass(&SIRemoveShortExecBranchesID);
   addPass(&SIPreEmitPeepholeID);
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -107,6 +107,7 @@
   SIFoldOperands.cpp
   SIFormMemoryClauses.cpp
   SIFrameLowering.cpp
+  SIInsertHardClauses.cpp
   SIInsertSkips.cpp
   SIInsertWaitcnts.cpp
   SIInstrInfo.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -0,0 +1,215 @@
+//===- SIInsertHardClauses.cpp - Insert Hard Clauses ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Insert s_clause instructions to form hard clauses.
+///
+/// Clausing load instructions can give cache coherency benefits. Before gfx10,
+/// the hardware automatically detected "soft clauses", which were sequences of
+/// memory instructions of the same type. In gfx10 this detection was removed,
+/// and the s_clause instruction was introduced to explicitly mark "hard
+/// clauses".
+///
+/// It's the scheduler's job to form the clauses by putting similar memory
+/// instructions next to each other. Our job is just to insert an s_clause
+/// instruction to mark the start of each clause.
+///
+/// Note that hard clauses are very similar to, but logically distinct from, the
+/// groups of instructions that have to be restartable when XNACK is enabled.
+/// The rules are slightly different in each case. For example an s_nop
+/// instruction breaks a restartable group, but can appear in the middle of a
+/// hard clause. (Before gfx10 there wasn't a distinction, and both were called
+/// "soft clauses" or just "clauses".)
+///
+/// The SIFormMemoryClauses pass and GCNHazardRecognizer deal with restartable
+/// groups, not hard clauses.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-insert-hard-clauses"
+
+namespace {
+
+// The longest clause we are willing to mark with a single s_clause.
+// NOTE(review): confirm this limit against the ISA documentation.
+constexpr unsigned MaxClauseLength = 64;
+
+enum HardClauseType {
+  // Texture, buffer, global or scratch memory instructions.
+  HARDCLAUSE_VMEM,
+  // Flat (not global or scratch) memory instructions.
+  HARDCLAUSE_FLAT,
+  // Instructions that access LDS.
+  HARDCLAUSE_LDS,
+  // Scalar memory instructions.
+  HARDCLAUSE_SMEM,
+  // VALU instructions.
+  HARDCLAUSE_VALU,
+  // Internal instructions, which are allowed in the middle of a hard clause,
+  // except for s_waitcnt.
+  HARDCLAUSE_INTERNAL,
+  // Instructions that are not allowed in a hard clause: SALU, export, branch,
+  // message, GDS, s_waitcnt and anything else not mentioned above.
+  HARDCLAUSE_ILLEGAL,
+};
+
+// Classify MI by the kind of hard clause it could belong to.
+HardClauseType getHardClauseType(const MachineInstr &MI) {
+  // On current architectures we only get a benefit from clausing loads.
+  if (MI.mayLoad()) {
+    if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
+      return HARDCLAUSE_VMEM;
+    if (SIInstrInfo::isFLAT(MI))
+      return HARDCLAUSE_FLAT;
+    // TODO: LDS
+    if (SIInstrInfo::isSMRD(MI))
+      return HARDCLAUSE_SMEM;
+  }
+
+  // Don't form VALU clauses. It's not clear what benefit they give, if any.
+
+  // In practice s_nop is the only internal instruction we're likely to see.
+  // It's safe to treat the rest as illegal.
+  if (MI.getOpcode() == AMDGPU::S_NOP)
+    return HARDCLAUSE_INTERNAL;
+  return HARDCLAUSE_ILLEGAL;
+}
+
+class SIInsertHardClauses : public MachineFunctionPass {
+public:
+  static char ID;
+
+  SIInsertHardClauses() : MachineFunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  // Track information about a clause as we discover it.
+  struct ClauseInfo {
+    // The type of all (non-internal) instructions in the clause.
+    HardClauseType Type = HARDCLAUSE_ILLEGAL;
+    // The first (necessarily non-internal) instruction in the clause.
+    MachineInstr *First = nullptr;
+    // The last non-internal instruction in the clause.
+    MachineInstr *Last = nullptr;
+    // The number of non-internal instructions in the clause.
+    unsigned Size = 0;
+    // The length of the clause from *First to *Last inclusive, counting any
+    // internal instructions in the middle.
+    unsigned Length = 0;
+    // The number of internal instructions seen since *Last. They only become
+    // part of the clause if another instruction is appended after them.
+    unsigned TrailingInternals = 0;
+    // The base operands of *Last.
+    SmallVector<const MachineOperand *, 4> BaseOps;
+  };
+
+  // Insert an s_clause before CI.First if CI describes a clause worth
+  // marking. Returns true if an instruction was inserted.
+  bool EmitClause(const ClauseInfo &CI, const TargetInstrInfo *TII) {
+    // A clause needs at least two real instructions. Checking Size rather
+    // than Length stops a single load followed by an s_nop from being marked.
+    if (CI.Size < 2)
+      return false;
+
+    // The operand encodes the clause length minus one. The caller never lets
+    // Length exceed MaxClauseLength; the clamp is purely defensive.
+    BuildMI(*CI.First->getParent(), *CI.First, CI.First->getDebugLoc(),
+            TII->get(AMDGPU::S_CLAUSE))
+        .addImm(std::min(CI.Length, MaxClauseLength) - 1);
+    return true;
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+    if (ST.getGeneration() < AMDGPUSubtarget::GFX10)
+      return false;
+
+    const SIInstrInfo *SII = ST.getInstrInfo();
+    const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+
+    bool Changed = false;
+    for (auto &MBB : MF) {
+      // Clause state is per basic block: an s_clause only describes the
+      // instructions that immediately follow it, so a clause must never be
+      // carried over into the next block in layout order.
+      ClauseInfo CI;
+
+      for (auto &MI : MBB) {
+        // Debug instructions emit no code. Skip them so that they neither
+        // break a clause nor make codegen depend on debug info.
+        if (MI.isDebugInstr())
+          continue;
+
+        HardClauseType Type = getHardClauseType(MI);
+
+        int64_t Dummy1;
+        bool Dummy2;
+        SmallVector<const MachineOperand *, 4> BaseOps;
+        if (Type < HARDCLAUSE_INTERNAL &&
+            !SII->getMemOperandsWithOffset(MI, BaseOps, Dummy1, Dummy2, TRI)) {
+          // We failed to get the base operands, so we'll never clause this
+          // instruction with any other, so pretend it's illegal.
+          Type = HARDCLAUSE_ILLEGAL;
+        }
+
+        if (CI.Size) {
+          // The length the clause would have if MI became its last
+          // instruction.
+          const unsigned NewLength = CI.Length + CI.TrailingInternals + 1;
+
+          if (Type == HARDCLAUSE_INTERNAL) {
+            // Keep the current clause. The instruction only counts towards
+            // the clause length if the clause is extended past it.
+            ++CI.TrailingInternals;
+          } else if (Type == CI.Type && NewLength <= MaxClauseLength &&
+                     SII->shouldClusterMemOps(CI.BaseOps, BaseOps,
+                                              CI.Size + 1)) {
+            // Extend the current clause.
+            CI.Last = &MI;
+            ++CI.Size;
+            CI.Length = NewLength;
+            CI.TrailingInternals = 0;
+            CI.BaseOps = std::move(BaseOps);
+          } else {
+            // Finish the current clause. MI may still start a new one below.
+            Changed |= EmitClause(CI, SII);
+            CI = ClauseInfo();
+          }
+        }
+
+        if (!CI.Size && Type < HARDCLAUSE_INTERNAL) {
+          // Start a new clause.
+          CI = ClauseInfo{Type, &MI, &MI, 1, 1, 0, std::move(BaseOps)};
+        }
+      }
+
+      // Finish the last clause in this basic block, if any.
+      Changed |= EmitClause(CI, SII);
+    }
+
+    return Changed;
+  }
+};
+
+} // namespace
+
+char SIInsertHardClauses::ID = 0;
+
+char &llvm::SIInsertHardClausesID = SIInsertHardClauses::ID;
+
+INITIALIZE_PASS(SIInsertHardClauses, DEBUG_TYPE, "SI Insert Hard Clauses",
+                false, false)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -255,6 +255,7 @@
 ;
 ; GFX10_W32-LABEL: test_div_fmas_f32:
 ; GFX10_W32: ; %bb.0:
+; GFX10_W32-NEXT: s_clause 0x4
 ; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8
 ; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x70
 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x94
@@ -274,6 +275,7 @@
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32:
 ; GFX10_W64: ; %bb.0:
+; GFX10_W64-NEXT: s_clause 0x4
 ; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8
 ; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x70
 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x94
@@ -333,6 +335,7 @@
 ;
 ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_0:
 ; GFX10_W32: ; %bb.0:
+; GFX10_W32-NEXT: s_clause 0x3
 ; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8
 ; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94
 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x70
@@ -350,6 +353,7 @@
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_0:
 ; GFX10_W64: ; %bb.0:
+; GFX10_W64-NEXT: s_clause 0x3
 ; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8
 ; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94
 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x70
@@ -407,6 +411,7 @@
 ;
 ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX10_W32: ; %bb.0:
+; GFX10_W32-NEXT: s_clause 0x3
 ; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x58
 ; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x34
 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x2c
@@ -424,6 +429,7 @@
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX10_W64: ; %bb.0:
+; GFX10_W64-NEXT: s_clause 0x3
 ;
GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x58 ; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x34 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -481,6 +487,7 @@ ; ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX10_W32: ; %bb.0: +; GFX10_W32-NEXT: s_clause 0x3 ; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8 ; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x70 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c @@ -498,6 +505,7 @@ ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX10_W64: ; %bb.0: +; GFX10_W64-NEXT: s_clause 0x3 ; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8 ; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x70 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c @@ -559,6 +567,7 @@ ; ; GFX10_W32-LABEL: test_div_fmas_f64: ; GFX10_W32: ; %bb.0: +; GFX10_W32-NEXT: s_clause 0x1 ; GFX10_W32-NEXT: s_load_dword s8, s[0:1], 0x44 ; GFX10_W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi @@ -577,6 +586,7 @@ ; ; GFX10_W64-LABEL: test_div_fmas_f64: ; GFX10_W64: ; %bb.0: +; GFX10_W64-NEXT: s_clause 0x1 ; GFX10_W64-NEXT: s_load_dword s8, s[0:1], 0x44 ; GFX10_W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) @@ -637,6 +647,7 @@ ; ; GFX10_W32-LABEL: test_div_fmas_f32_cond_to_vcc: ; GFX10_W32: ; %bb.0: +; GFX10_W32-NEXT: s_clause 0x1 ; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi @@ -655,6 +666,7 @@ ; ; GFX10_W64-LABEL: test_div_fmas_f32_cond_to_vcc: ; GFX10_W64: ; %bb.0: +; GFX10_W64-NEXT: s_clause 0x1 ; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) @@ -712,6 +724,7 @@ ; ; GFX10_W32-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX10_W32: ; %bb.0: +; GFX10_W32-NEXT: s_clause 0x3 ; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x70 ; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94 ; GFX10_W32-NEXT: 
s_load_dword s4, s[0:1], 0x4c @@ -729,6 +742,7 @@ ; ; GFX10_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX10_W64: ; %bb.0: +; GFX10_W64-NEXT: s_clause 0x3 ; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x70 ; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c @@ -784,6 +798,7 @@ ; ; GFX10_W32-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX10_W32: ; %bb.0: +; GFX10_W32-NEXT: s_clause 0x3 ; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x70 ; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c @@ -801,6 +816,7 @@ ; ; GFX10_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX10_W64: ; %bb.0: +; GFX10_W64-NEXT: s_clause 0x3 ; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x70 ; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c @@ -901,6 +917,7 @@ ; GFX10_W32-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo ; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s2, 0, s2 +; GFX10_W32-NEXT: s_clause 0x2 ; GFX10_W32-NEXT: global_load_dword v1, v[1:2], off ; GFX10_W32-NEXT: global_load_dword v2, v[3:4], off offset:-4 ; GFX10_W32-NEXT: global_load_dword v3, v[3:4], off @@ -932,6 +949,7 @@ ; GFX10_W64-NEXT: v_add_co_ci_u32_e32 v4, vcc, 0, v2, vcc ; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2 +; GFX10_W64-NEXT: s_clause 0x2 ; GFX10_W64-NEXT: global_load_dword v1, v[1:2], off ; GFX10_W64-NEXT: global_load_dword v2, v[3:4], off offset:-4 ; GFX10_W64-NEXT: global_load_dword v3, v[3:4], off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -57,6 +57,7 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32_e64 v2, 
vcc_lo, v0, 4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: global_load_dword v1, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -131,6 +132,7 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: global_load_dword v1, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -211,6 +213,7 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -291,6 +294,7 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -889,6 +893,7 @@ ; ; GFX10-LABEL: test_div_scale_f32_all_scalar_1: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c ; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -934,6 +939,7 @@ ; ; GFX10-LABEL: test_div_scale_f32_all_scalar_2: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c ; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -981,6 +987,7 @@ ; ; GFX10-LABEL: test_div_scale_f64_all_scalar_1: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; GFX10-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x74 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -1028,6 +1035,7 @@ ; ; GFX10-LABEL: test_div_scale_f64_all_scalar_2: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -1229,6 +1237,7 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: global_load_dword v1, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -1309,6 +1318,7 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: global_load_dword v1, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll @@ -4,6 +4,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GCN-LABEL: test_wave32: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: s_load_dword s1, s[4:5], 0x0 ; GCN-NEXT: s_load_dword s0, s[4:5], 0x24 ; GCN-NEXT: ; implicit-def: $vcc_hi diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll @@ -4,6 +4,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GCN-LABEL: test_wave32: 
; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: s_load_dword s1, s[4:5], 0x24 ; GCN-NEXT: ; implicit-def: $vcc_hi diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -20,6 +20,7 @@ ; ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa] ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] ; GFX10-NEXT: ; implicit-def: $vcc_hi diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -19,6 +19,7 @@ ; ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -276,6 +276,7 @@ ; GFX1064-LABEL: add_i32_uniform: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX1064-NEXT: ; implicit-def: $vgpr1 @@ -310,6 +311,7 @@ ; ; GFX1032-LABEL: add_i32_uniform: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_clause 0x1 ; 
GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX1032-NEXT: s_mov_b32 s2, exec_lo @@ -1827,6 +1829,7 @@ ; GFX1064-LABEL: sub_i32_uniform: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX1064-NEXT: ; implicit-def: $vgpr1 @@ -1861,6 +1864,7 @@ ; ; GFX1032-LABEL: sub_i32_uniform: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX1032-NEXT: s_mov_b32 s2, exec_lo diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -102,6 +102,7 @@ ; ; GFX10-DL-LABEL: udot2: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -244,6 +245,7 @@ ; ; GFX10-DL-LABEL: udot2_MulMul: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -377,6 +379,7 @@ ; ; GFX10-DL-LABEL: idot2: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -511,6 +514,7 @@ ; ; GFX10-DL-LABEL: idot2_MixedTypedMul: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -648,6 +652,7 @@ ; ; GFX10-DL-LABEL: udot2_alt_AddOperands: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: 
s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -782,6 +787,7 @@ ; ; GFX10-DL-LABEL: idot2_MixedExt: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -905,6 +911,7 @@ ; ; GFX10-DL-LABEL: notudot2_SameVec: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -1039,6 +1046,7 @@ ; ; GFX10-DL-LABEL: udot2_v4i16: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -1171,6 +1179,7 @@ ; ; GFX10-DL-LABEL: udot2_v4i16_Hi: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -1310,6 +1319,7 @@ ; ; GFX10-DL-LABEL: notudot2_v4i16_Even: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -1455,6 +1465,7 @@ ; ; GFX10-DL-LABEL: notudot2_v4i16_Middle: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -1600,6 +1611,7 @@ ; ; GFX10-DL-LABEL: notudot2_DiffIndex: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; 
GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -1749,6 +1761,7 @@ ; ; GFX10-DL-LABEL: udot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -1896,6 +1909,7 @@ ; ; GFX10-DL-LABEL: idot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -2047,6 +2061,7 @@ ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -2195,6 +2210,7 @@ ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -2347,6 +2363,7 @@ ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -2495,6 +2512,7 @@ ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -2784,6 +2802,7 @@ ; ; GFX10-DL-LABEL: notsdot2_sext8: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi diff --git 
a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -119,6 +119,7 @@ ; ; GFX10-DL-LABEL: idot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -641,6 +642,7 @@ ; ; GFX10-DL-LABEL: idot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -839,6 +841,7 @@ ; ; GFX10-DL-LABEL: idot4_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -122,6 +122,7 @@ ; ; GFX10-DL-LABEL: udot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -1138,6 +1139,7 @@ ; ; GFX10-DL-LABEL: udot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -1345,6 +1347,7 @@ ; ; GFX10-DL-LABEL: udot4_multiuse_add1: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -1743,6 +1746,7 @@ ; ; GFX10-DL-LABEL: udot4_acc32_vecMul: ; GFX10-DL: ; 
%bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -167,6 +167,7 @@ ; ; GFX10-DL-LABEL: idot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -1130,6 +1131,7 @@ ; ; GFX10-DL-LABEL: idot8_multiuses_mul1: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -1401,6 +1403,7 @@ ; ; GFX10-DL-LABEL: idot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -167,6 +167,7 @@ ; ; GFX10-DL-LABEL: udot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -1654,6 +1655,7 @@ ; ; GFX10-DL-LABEL: udot8_multiuses_mul1: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -1924,6 +1926,7 @@ ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; 
GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi @@ -3085,6 +3088,7 @@ ; ; GFX10-DL-LABEL: udot8_variant1: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll @@ -120,6 +120,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_and: ;CHECK-NEXT: %bb. +;GFX10-NEXT: s_clause ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 ;CHECK: s_waitcnt @@ -145,6 +146,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_or: ;CHECK-NEXT: %bb. ;CHECK-NEXT: v_lshlrev_b32_e32 v{{[0-9]}}, 6, v0 +;GFX10-NEXT: s_clause ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28 ;CHECK: s_waitcnt @@ -170,6 +172,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_glc_slc: ;CHECK-NEXT: %bb. +;GFX10-NEXT: s_clause ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} @@ -233,6 +236,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x1_offset_merged: ;CHECK-NEXT: %bb. 
+;GFX10-NEXT: s_clause ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 ;CHECK: s_waitcnt @@ -402,6 +406,7 @@ ;CHECK-LABEL: {{^}}raw_buffer_load_x1_offset_merged: ;CHECK-NEXT: %bb. +;GFX10-NEXT: s_clause ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 ;CHECK: s_waitcnt @@ -420,6 +425,7 @@ ;CHECK-LABEL: {{^}}raw_buffer_load_x1_offset_swizzled_not_merged: ;CHECK-NEXT: %bb. +;GFX10-NEXT: s_clause ;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:4 ;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:8 ;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:12 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -154,6 +154,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 @@ -986,6 +987,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-NEXT: global_load_ushort v4, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll --- a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -377,6 +377,7 @@ ; GCN-NEXT: %bb. 
; SICI-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1 ; SICI-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x7 +; GFX10-NEXT: s_clause ; VIGFX9_10-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x4 ; VIGFX9_10-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1c define amdgpu_ps void @smrd_imm_merged(<4 x i32> inreg %desc) #0 { @@ -447,6 +448,7 @@ ; GCN-LABEL: {{^}}smrd_vgpr_merged: ; GCN-NEXT: %bb. +; GFX10-NEXT: s_clause ; GCN-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 ; GCN-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 define amdgpu_ps void @smrd_vgpr_merged(<4 x i32> inreg %desc, i32 %a) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -13,6 +13,7 @@ ; GCN-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8 ; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; GCN-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GCN-NEXT: BB0_2: ; Parent Loop BB0_1 Depth=1