diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -4736,18 +4736,48 @@ 1. buffer/global/flat_load glc=1 - - - nontemporal + 2. s_waitcnt vmcnt(0) + + - Must happen before + any following volatile + global/generic + load/store. + - Ensures that + volatile + operations to + different + addresses will not + be reordered by + hardware. + + - !volatile & nontemporal 1. buffer/global/flat_load glc=1 slc=1 load *none* *none* - local 1. ds_load - store *none* *none* - global - !nontemporal + store *none* *none* - global - !volatile & !nontemporal - generic - private 1. buffer/global/flat_store - constant - - nontemporal + - volatile & !nontemporal + + 1. buffer/global/flat_store + 2. s_waitcnt vmcnt(0) + + - Must happen before + any following volatile + global/generic + load/store. + - Ensures that + volatile + operations to + different + addresses will not + be reordered by + hardware. + + - !volatile & nontemporal 1. buffer/global/flat_store glc=1 slc=1 @@ -6008,18 +6038,48 @@ 1. buffer/global/flat_load glc=1 dlc=1 - - - nontemporal + 2. s_waitcnt vmcnt(0) + + - Must happen before + any following volatile + global/generic + load/store. + - Ensures that + volatile + operations to + different + addresses will not + be reordered by + hardware. + + - !volatile & nontemporal 1. buffer/global/flat_load slc=1 load *none* *none* - local 1. ds_load - store *none* *none* - global - !nontemporal + store *none* *none* - global - !volatile & !nontemporal - generic - private 1. buffer/global/flat_store - constant - - nontemporal + - volatile & !nontemporal + + 1. buffer/global/flat_store + 2. s_waitcnt vscnt(0) + + - Must happen before + any following volatile + global/generic + load/store. + - Ensures that + volatile + operations to + different + addresses will not + be reordered by + hardware. + + - !volatile & nontemporal 1. buffer/global/flat_store slc=1 diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -129,6 +129,7 @@ SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; bool IsCrossAddressSpaceOrdering = false; + bool IsVolatile = false; bool IsNonTemporal = false; SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, @@ -138,11 +139,13 @@ bool IsCrossAddressSpaceOrdering = true, AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent, + bool IsVolatile = false, bool IsNonTemporal = false) : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope), OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace), IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), + IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal) { // There is also no cross address space ordering if the ordering // address space is the same as the instruction address space and @@ -190,7 +193,13 @@ } /// \returns True if memory access of the machine instruction used to - /// create this SIMemOpInfo is non-temporal, false otherwise. + /// create this SIMemOpInfo is volatile, false otherwise. + bool isVolatile() const { + return IsVolatile; + } + + /// \returns True if memory access of the machine instruction used to + /// create this SIMemOpInfo is nontemporal, false otherwise. bool isNonTemporal() const { return IsNonTemporal; } @@ -278,10 +287,13 @@ SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const = 0; - /// Update \p MI memory instruction to indicate it is - /// nontemporal. Return true iff the instruction was modified. - virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI) - const = 0; + /// Update \p MI memory instruction of kind \p Op associated with address + /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return + /// true iff the instruction was modified. + virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, + SIAtomicAddrSpace AddrSpace, + SIMemOp Op, bool IsVolatile, + bool IsNonTemporal) const = 0; /// Inserts any necessary instructions at position \p Pos relative /// to instruction \p MI to ensure memory instructions before \p Pos of kind @@ -347,7 +359,10 @@ SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const override; - bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override; + bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, + bool IsNonTemporal) const override; bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -397,7 +412,10 @@ SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const override; - bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override; + bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, + bool IsNonTemporal) const override; bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -548,11 +566,13 @@ AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; bool IsNonTemporal = true; + bool IsVolatile = false; // Validator should check whether or not MMOs cover the entire set of // locations accessed by the memory instruction. for (const auto &MMO : MI->memoperands()) { IsNonTemporal &= MMO->isNonTemporal(); + IsVolatile |= MMO->isVolatile(); InstrAddrSpace |= toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace()); AtomicOrdering OpOrdering = MMO->getOrdering(); @@ -595,7 +615,8 @@ } } return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, - IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal); + IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile, + IsNonTemporal); } Optional SIMemOpAccess::getLoadInfo( @@ -722,14 +743,43 @@ return Changed; } -bool SIGfx6CacheControl::enableNonTemporal( - const MachineBasicBlock::iterator &MI) const { +bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( + MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, bool IsNonTemporal) const { + // Only handle load and store, not atomic read-modify-write insructions. The + // latter use glc to indicate if the atomic returns a result and so must not + // be used for cache control. assert(MI->mayLoad() ^ MI->mayStore()); + + // Only update load and store, not LLVM IR atomic read-modify-write + // instructions. The latter are always marked as volatile so cannot sensibly + // handle it as do not want to pessimize all atomics. Also they do not support + // the nontemporal attribute. + assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE); + bool Changed = false; - /// TODO: Do not enableGLCBit if rmw atomic. - Changed |= enableGLCBit(MI); - Changed |= enableSLCBit(MI); + if (IsVolatile) { + if (Op == SIMemOp::LOAD) + Changed |= enableGLCBit(MI); + + // Ensure operation has completed at system scope to cause all volatile + // operations to be visible outside the program in a global order. Do not + // request cross address space as only the global address space can be + // observable outside the program, so no need to cause a waitcnt for LDS + // address space operations. + Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, + Position::AFTER); + + return Changed; + } + + if (IsNonTemporal) { + // Request L1 MISS_EVICT and L2 STREAM for load and store instructions. + Changed |= enableGLCBit(MI); + Changed |= enableSLCBit(MI); + return Changed; + } return Changed; } @@ -751,7 +801,8 @@ bool VMCnt = false; bool LGKMCnt = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != + SIAtomicAddrSpace::NONE) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -978,13 +1029,45 @@ return Changed; } -bool SIGfx10CacheControl::enableNonTemporal( - const MachineBasicBlock::iterator &MI) const { +bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( + MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, bool IsNonTemporal) const { + + // Only handle load and store, not atomic read-modify-write insructions. The + // latter use glc to indicate if the atomic returns a result and so must not + // be used for cache control. assert(MI->mayLoad() ^ MI->mayStore()); + + // Only update load and store, not LLVM IR atomic read-modify-write + // instructions. The latter are always marked as volatile so cannot sensibly + // handle it as do not want to pessimize all atomics. Also they do not support + // the nontemporal attribute. + assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE); + bool Changed = false; - Changed |= enableSLCBit(MI); - /// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI) + if (IsVolatile) { + + if (Op == SIMemOp::LOAD) { + Changed |= enableGLCBit(MI); + Changed |= enableDLCBit(MI); + } + + // Ensure operation has completed at system scope to cause all volatile + // operations to be visible outside the program in a global order. Do not + // request cross address space as only the global address space can be + // observable outside the program, so no need to cause a waitcnt for LDS + // address space operations. + Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, + Position::AFTER); + return Changed; + } + + if (IsNonTemporal) { + // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions. + Changed |= enableSLCBit(MI); + return Changed; + } return Changed; } @@ -1007,7 +1090,8 @@ bool VSCnt = false; bool LGKMCnt = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != + SIAtomicAddrSpace::NONE) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -1210,12 +1294,12 @@ return Changed; } - // Atomic instructions do not have the nontemporal attribute. - if (MOI.isNonTemporal()) { - Changed |= CC->enableNonTemporal(MI); - return Changed; - } - + // Atomic instructions already bypass caches to the scope specified by the + // SyncScope operand. Only non-atomic volatile and nontemporal instructions + // need additional treatment. + Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(), + SIMemOp::LOAD, MOI.isVolatile(), + MOI.isNonTemporal()); return Changed; } @@ -1236,12 +1320,12 @@ return Changed; } - // Atomic instructions do not have the nontemporal attribute. - if (MOI.isNonTemporal()) { - Changed |= CC->enableNonTemporal(MI); - return Changed; - } - + // Atomic instructions already bypass caches to the scope specified by the + // SyncScope operand. Only non-atomic volatile and nontemporal instructions + // need additional treatment. + Changed |= CC->enableVolatileAndOrNonTemporal( + MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(), + MOI.isNonTemporal()); return Changed; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll @@ -59,9 +59,11 @@ ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: BB3_2: ; %bb1 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) entry: %trunc = trunc i32 %cond to i1 br i1 %trunc, label %bb0, label %bb1 @@ -88,9 +90,11 @@ ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: BB4_2: ; %bb1 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) entry: %trunc0 = trunc i32 %cond0 to i1 %trunc1 = trunc i32 %cond1 to i1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -12,7 +12,8 @@ ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_cbranch_execz BB0_2 ; CHECK-NEXT: ; %bb.1: ; %if.true -; CHECK-NEXT: global_load_dword v0, v[0:1], off +; CHECK-NEXT: global_load_dword v0, v[0:1], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: BB0_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -39,7 +40,8 @@ ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_cbranch_execz BB1_2 ; CHECK-NEXT: ; %bb.1: ; %if.true -; CHECK-NEXT: global_load_dword v0, v[0:1], off +; CHECK-NEXT: global_load_dword v0, v[0:1], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: BB1_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -68,7 +70,8 @@ ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_cbranch_execz BB2_2 ; CHECK-NEXT: ; %bb.1: ; %if.true -; CHECK-NEXT: global_load_dword v0, v[0:1], off +; CHECK-NEXT: global_load_dword v0, v[0:1], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: BB2_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -99,7 +102,8 @@ ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_cbranch_execz BB3_2 ; CHECK-NEXT: ; %bb.1: ; %if.true -; CHECK-NEXT: global_load_dword v0, v[0:1], off +; CHECK-NEXT: global_load_dword v0, v[0:1], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: BB3_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -219,7 +223,7 @@ ; CHECK-NEXT: s_cbranch_vccnz BB5_1 ; CHECK-NEXT: ; %bb.3: ; %bb4 ; CHECK-NEXT: ; in Loop: Header=BB5_2 Depth=1 -; CHECK-NEXT: global_load_dword v2, v[0:1], off +; CHECK-NEXT: global_load_dword v2, v[0:1], off glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ge_i32_e64 s[2:3], v0, v2 ; CHECK-NEXT: s_branch BB5_1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -15,8 +15,10 @@ ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: s_add_u32 s1, 4, s1 ; GFX9-NEXT: scratch_store_dword off, v0, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_add_u32 s0, 4, s0 -; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_kernel: @@ -34,7 +36,9 @@ ; GFX10-NEXT: s_add_u32 s0, 4, s0 ; GFX10-NEXT: s_add_u32 s1, 4, s1 ; GFX10-NEXT: scratch_store_dword off, v0, s0 -; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) @@ -61,8 +65,10 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: scratch_store_dword v1, v3, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 -; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vindex_kernel: @@ -79,7 +85,9 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 ; GFX10-NEXT: scratch_store_dword v0, v3, off -; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) @@ -107,8 +115,9 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: scratch_store_dword v1, v3, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 -; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -124,9 +133,9 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 ; GFX10-NEXT: scratch_store_dword v0, v3, off -; GFX10-NEXT: scratch_load_dword v0, v1, off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [32 x float], align 4, addrspace(5) @@ -176,10 +185,13 @@ ; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: s_add_u32 s1, 0x104, s1 -; GFX9-NEXT: scratch_load_dword v1, off, s2 +; GFX9-NEXT: scratch_load_dword v1, off, s2 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_store_dword off, v0, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_add_u32 s0, 0x104, s0 -; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_small_offset_kernel: @@ -190,7 +202,7 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: s_add_u32 s1, 4, 0 -; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -200,7 +212,9 @@ ; GFX10-NEXT: s_add_u32 s0, 0x104, s0 ; GFX10-NEXT: s_add_u32 s1, 0x104, s1 ; GFX10-NEXT: scratch_store_dword off, v0, s0 -; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -224,7 +238,7 @@ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_add_u32 s0, 4, 0 -; GFX9-NEXT: scratch_load_dword v1, off, s0 +; GFX9-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 @@ -233,8 +247,10 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: scratch_store_dword v1, v3, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 -; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vindex_small_offset_kernel: @@ -251,9 +267,12 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, s0 +; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: scratch_store_dword v0, v3, off -; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -278,9 +297,9 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s32, 0 -; GFX9-NEXT: scratch_load_dword v1, off, s0 -; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x100 +; GFX9-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x100 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, vcc_hi @@ -288,8 +307,9 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: scratch_store_dword v1, v3, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 -; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -306,11 +326,12 @@ ; GFX10-NEXT: s_add_u32 s0, s32, 0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, s0 -; GFX10-NEXT: scratch_store_dword v0, v3, off -; GFX10-NEXT: scratch_load_dword v0, v1, off +; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_store_dword v0, v3, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -341,10 +362,13 @@ ; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: s_add_u32 s1, 0x4004, s1 -; GFX9-NEXT: scratch_load_dword v1, off, s2 +; GFX9-NEXT: scratch_load_dword v1, off, s2 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_store_dword off, v0, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 -; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_large_offset_kernel: @@ -355,7 +379,7 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: s_add_u32 s1, 4, 0 -; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -365,7 +389,9 @@ ; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 ; GFX10-NEXT: s_add_u32 s1, 0x4004, s1 ; GFX10-NEXT: scratch_store_dword off, v0, s0 -; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -389,7 +415,7 @@ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_add_u32 s0, 4, 0 -; GFX9-NEXT: scratch_load_dword v1, off, s0 +; GFX9-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 @@ -398,8 +424,10 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: scratch_store_dword v1, v3, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 -; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vindex_large_offset_kernel: @@ -416,9 +444,12 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, s0 +; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: scratch_store_dword v0, v3, off -; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -443,9 +474,9 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s32, 0 -; GFX9-NEXT: scratch_load_dword v1, off, s0 -; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, vcc_hi @@ -453,8 +484,9 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: scratch_store_dword v1, v3, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 -; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -471,11 +503,12 @@ ; GFX10-NEXT: s_add_u32 s0, s32, 0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, s0 -; GFX10-NEXT: scratch_store_dword v0, v3, off -; GFX10-NEXT: scratch_load_dword v0, v1, off +; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_store_dword v0, v3, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -501,11 +534,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_add_u32 s0, 4, 0 ; GFX9-NEXT: scratch_store_dword off, v0, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_movk_i32 s0, 0x3e80 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_add_u32 s0, 4, s0 ; GFX9-NEXT: scratch_store_dword off, v0, s0 -; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_large_imm_offset_kernel: @@ -520,8 +556,11 @@ ; GFX10-NEXT: s_add_u32 s1, 4, 0 ; GFX10-NEXT: s_add_u32 s0, 4, s0 ; GFX10-NEXT: scratch_store_dword off, v0, s1 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 -; GFX10-NEXT: scratch_load_dword v0, off, s0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm bb: %i = alloca [4096 x i32], align 4, addrspace(5) @@ -541,11 +580,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_add_u32 s0, s32, 0 ; GFX9-NEXT: scratch_store_dword off, v0, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_movk_i32 s0, 0x3e80 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_add_u32 s0, s32, s0 ; GFX9-NEXT: scratch_store_dword off, v0, s0 -; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -559,10 +600,11 @@ ; GFX10-NEXT: s_add_u32 s1, s32, 0 ; GFX10-NEXT: s_add_u32 s0, s32, s0 ; GFX10-NEXT: scratch_store_dword off, v0, s1 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 -; GFX10-NEXT: scratch_load_dword v0, off, s0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [4096 x i32], align 4, addrspace(5) @@ -586,7 +628,9 @@ ; GFX9-NEXT: v_add_lshl_u32 v0, s0, v0, 2 ; GFX9-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 -; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vidx_sidx_offset: @@ -601,7 +645,9 @@ ; GFX10-NEXT: v_add_lshl_u32 v0, s0, v0, 2 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024 -; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm bb: %alloca = alloca [32 x i32], align 4, addrspace(5) @@ -621,7 +667,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off -; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -632,9 +679,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off -; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, i64 addrspace(5)* %arg, align 8 @@ -649,7 +696,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off -; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -660,9 +708,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off -; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, i64 addrspace(5)* %arg, align 1 @@ -681,7 +729,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off -; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -696,9 +745,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off -; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] bb: store volatile <3 x i32> , <3 x i32> addrspace(5)* %arg, align 1 @@ -719,7 +768,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off -; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -736,9 +786,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off -; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] bb: store volatile <4 x i32> , <4 x i32> addrspace(5)* %arg, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -13,14 +13,15 @@ ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[4:5] -; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[6:7] -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2 ; SI-NEXT: v_med3_f32 v2, v2, v3, v4 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -43,16 +44,17 @@ ; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] -; VI-NEXT: flat_load_dword v2, v[4:5] +; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v1, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, s1 ; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v8 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_med3_f32 v0, v0, v1, v2 ; VI-NEXT: flat_store_dword v[6:7], v0 ; VI-NEXT: s_endpgm @@ -62,12 +64,13 @@ ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -98,19 +101,19 @@ ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[4:5] -; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[6:7] -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_min_f32_e32 v5, v2, v3 ; SI-NEXT: v_max_f32_e32 v2, v2, v3 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_min_f32_e32 v2, v2, v3 @@ -138,21 +141,21 @@ ; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: flat_load_dword v3, v[4:5] +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_sub_f32_e32 v4, 0x80000000, v7 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; VI-NEXT: v_min_f32_e32 v5, v4, v2 ; VI-NEXT: v_max_f32_e32 v2, v4, v2 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_min_f32_e32 v2, v2, v3 @@ -167,17 +170,17 @@ ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 @@ -213,15 +216,16 @@ ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[4:5] -; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[6:7] -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s2, 0x80000000 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_sub_f32_e32 v2, s2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_sub_f32_e64 v4, s2, |v4| ; SI-NEXT: v_med3_f32 v2, v2, |v3|, v4 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] @@ -245,17 +249,18 @@ ; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: flat_load_dword v3, v[4:5] +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_mov_b32 s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_sub_f32_e32 v4, s2, v7 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_f32_e64 v3, s2, |v3| ; VI-NEXT: v_med3_f32 v2, v4, |v2|, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -266,13 +271,14 @@ ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s2, 0x80000000 -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_sub_f32_e32 v1, s2, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_f32_e64 v3, s2, |v3| ; GFX9-NEXT: v_med3_f32 v1, v1, |v2|, v3 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -310,17 +316,17 @@ ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[4:5] -; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[6:7] -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s2, 0x80000000 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_sub_f32_e64 v2, s2, |v2| -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_sub_f32_e64 v3, s2, |v3| -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_sub_f32_e64 v4, s2, |v4| ; SI-NEXT: v_med3_f32 v2, v2, v3, v4 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] @@ -344,19 +350,19 @@ ; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: flat_load_dword v3, v[4:5] +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_sub_f32_e64 v4, s2, |v7| -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_sub_f32_e64 v2, s2, |v2| -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_f32_e64 v3, s2, |v3| ; VI-NEXT: v_med3_f32 v2, v4, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -367,15 +373,15 @@ ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s2, 0x80000000 -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_sub_f32_e64 v1, s2, |v1| -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_sub_f32_e64 v2, s2, |v2| -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_f32_e64 v3, s2, |v3| ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -415,16 +421,16 @@ ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[4:5] -; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[6:7] -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_f32_e32 v3, 2.0, v3 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v4, 4.0, v4 ; SI-NEXT: v_min_f32_e32 v5, v2, v3 ; SI-NEXT: v_max_f32_e32 v2, v2, v3 @@ -454,20 +460,20 @@ ; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: flat_load_dword v3, v[4:5] +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_f32_e32 v4, 1.0, v7 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_f32_e32 v2, 2.0, v2 ; VI-NEXT: v_min_f32_e32 v5, v4, v2 ; VI-NEXT: v_max_f32_e32 v2, v4, v2 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 ; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_min_f32_e32 v2, v2, v3 @@ -482,16 +488,16 @@ ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 ; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 @@ -536,22 +542,23 @@ ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[4:5] -; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7] -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_min_f32_e32 v5, v2, v3 ; SI-NEXT: v_max_f32_e32 v2, v2, v3 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_min_f32_e32 v2, v2, v3 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 @@ -578,20 +585,20 @@ ; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: flat_load_dword v3, v[4:5] +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_min_f32_e32 v5, v4, v2 ; VI-NEXT: v_max_f32_e32 v2, v4, v2 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_min_f32_e32 v2, v2, v3 @@ -599,6 +606,7 @@ ; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_max_f32_e32 v2, v3, v2 ; VI-NEXT: flat_store_dword v[0:1], v5 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -607,17 +615,18 @@ ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v[0:1], v4, off -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -810,8 +810,11 @@ ; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[12:13] ; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[14:15] ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_setpc_b64 s[30:31] @@ -874,8 +877,11 @@ ; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, v1, s8 ; MOVREL-NEXT: v_cndmask_b32_e64 v18, v18, v1, s9 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off ; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: s_setpc_b64 s[30:31] @@ -954,9 +960,13 @@ ; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v0, s[8:9] ; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v0, s[10:11] ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[1:4], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[5:8], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[9:12], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[13:16], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_endpgm ; ; MOVREL-LABEL: dyn_insertelement_v8f64_s_s_v: @@ -1020,9 +1030,13 @@ ; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, s30, s4 ; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, s31, s4 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[1:4], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[5:8], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[9:12], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[13:16], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: s_endpgm entry: %insert = insertelement <8 x double> %vec, double %val, i32 %idx @@ -1078,9 +1092,13 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v3, v1 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[10:13], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[14:17], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_endpgm ; ; MOVREL-LABEL: dyn_insertelement_v8f64_s_v_s: @@ -1121,9 +1139,13 @@ ; MOVREL-NEXT: v_movreld_b32_e32 v2, v0 ; MOVREL-NEXT: v_movreld_b32_e32 v3, v1 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: s_endpgm entry: %insert = insertelement <8 x double> %vec, double %val, i32 %idx @@ -1147,9 +1169,13 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v1, s3 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_endpgm ; ; MOVREL-LABEL: dyn_insertelement_v8f64_v_s_s: @@ -1158,9 +1184,13 @@ ; MOVREL-NEXT: v_movreld_b32_e32 v0, s2 ; MOVREL-NEXT: v_movreld_b32_e32 v1, s3 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: s_endpgm entry: %insert = insertelement <8 x double> %vec, double %val, i32 %idx @@ -1235,9 +1265,13 @@ ; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[8:9] ; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[10:11] ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_endpgm ; ; MOVREL-LABEL: dyn_insertelement_v8f64_s_v_v: @@ -1299,9 +1333,13 @@ ; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, v1, s4 ; MOVREL-NEXT: v_cndmask_b32_e64 v18, v18, v1, s5 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: s_endpgm entry: %insert = insertelement <8 x double> %vec, double %val, i32 %idx @@ -1346,9 +1384,13 @@ ; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v16, s[14:15] ; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[12:13] ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_endpgm ; ; MOVREL-LABEL: dyn_insertelement_v8f64_v_s_v: @@ -1378,9 +1420,13 @@ ; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, s2, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, s3, vcc_lo ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: s_endpgm entry: %insert = insertelement <8 x double> %vec, double %val, i32 %idx @@ -1404,9 +1450,13 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v1, v17 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_endpgm ; ; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_s: @@ -1415,9 +1465,13 @@ ; MOVREL-NEXT: v_movreld_b32_e32 v0, v16 ; MOVREL-NEXT: v_movreld_b32_e32 v1, v17 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: s_endpgm entry: %insert = insertelement <8 x double> %vec, double %val, i32 %idx @@ -1460,9 +1514,13 @@ ; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[12:13] ; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v17, s[10:11] ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_endpgm ; ; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v: @@ -1494,9 +1552,13 @@ ; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, v17, s6 ; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, v17, s5 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: s_endpgm entry: %insert = insertelement <8 x double> %vec, double %val, i32 %idx @@ -1978,24 +2040,25 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v2, s2 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s3 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: v_mov_b32_e32 v0, s4 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s7 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: v_mov_b32_e32 v0, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s10 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s11 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: v_mov_b32_e32 v0, s12 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s13 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s15 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_endpgm ; ; MOVREL-LABEL: dyn_insertelement_v8f64_s_s_s_add_1: @@ -2035,9 +2098,13 @@ ; MOVREL-NEXT: v_mov_b32_e32 v14, s14 ; MOVREL-NEXT: v_mov_b32_e32 v15, s15 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: s_endpgm entry: %idx.add = add i32 %idx, 1 @@ -2082,9 +2149,13 @@ ; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[12:13] ; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v17, s[10:11] ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_endpgm ; ; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v_add_1: @@ -2117,9 +2188,13 @@ ; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, v17, s6 ; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, v17, s5 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: s_endpgm entry: %idx.add = add i32 %idx, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -818,9 +818,12 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] -; GFX7-NEXT: buffer_load_dword v3, v[1:2], s[0:3], 0 addr64 -; GFX7-NEXT: buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:4 -; GFX7-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 offset:8 +; GFX7-NEXT: buffer_load_dword v3, v[1:2], s[0:3], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:4 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 offset:8 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_cmp_lg_u32 s8, 0 ; GFX7-NEXT: s_cselect_b32 s0, 1, 0 ; GFX7-NEXT: s_and_b32 s0, 1, s0 @@ -829,7 +832,6 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_and_b64 vcc, vcc, s[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_div_fmas_f32 v0, v3, v4, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 ; GFX7-NEXT: s_endpgm @@ -848,9 +850,12 @@ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 8, v1 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_dword v1, v[1:2] -; GFX8-NEXT: flat_load_dword v2, v[3:4] -; GFX8-NEXT: flat_load_dword v3, v[5:6] +; GFX8-NEXT: flat_load_dword v1, v[1:2] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_load_dword v2, v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s4, 8 ; GFX8-NEXT: s_addc_u32 s1, s5, 0 ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 @@ -859,8 +864,7 @@ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2 ; GFX8-NEXT: s_and_b64 vcc, vcc, s[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -874,16 +878,17 @@ ; GFX10_W32-NEXT: s_load_dword s0, s[0:1], 0x54 ; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_clause 0x2 -; GFX10_W32-NEXT: global_load_dword v2, v1, s[6:7] -; GFX10_W32-NEXT: global_load_dword v3, v1, s[6:7] offset:4 -; GFX10_W32-NEXT: global_load_dword v1, v1, s[6:7] offset:8 +; GFX10_W32-NEXT: global_load_dword v2, v1, s[6:7] glc dlc +; GFX10_W32-NEXT: s_waitcnt vmcnt(0) +; GFX10_W32-NEXT: global_load_dword v3, v1, s[6:7] offset:4 glc dlc +; GFX10_W32-NEXT: s_waitcnt vmcnt(0) +; GFX10_W32-NEXT: global_load_dword v1, v1, s[6:7] offset:8 glc dlc +; GFX10_W32-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10_W32-NEXT: s_and_b32 s0, 1, s0 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX10_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s0 -; GFX10_W32-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-NEXT: v_div_fmas_f32 v0, v2, v3, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: global_store_dword v1, v0, s[4:5] offset:8 @@ -896,16 +901,17 @@ ; GFX10_W64-NEXT: s_load_dword s0, s[0:1], 0x54 ; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_clause 0x2 -; GFX10_W64-NEXT: global_load_dword v2, v1, s[6:7] -; GFX10_W64-NEXT: global_load_dword v3, v1, s[6:7] offset:4 -; GFX10_W64-NEXT: global_load_dword v1, v1, s[6:7] offset:8 +; GFX10_W64-NEXT: global_load_dword v2, v1, s[6:7] glc dlc +; GFX10_W64-NEXT: s_waitcnt vmcnt(0) +; GFX10_W64-NEXT: global_load_dword v3, v1, s[6:7] offset:4 glc dlc +; GFX10_W64-NEXT: s_waitcnt vmcnt(0) +; GFX10_W64-NEXT: global_load_dword v1, v1, s[6:7] offset:8 glc dlc +; GFX10_W64-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10_W64-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10_W64-NEXT: s_and_b32 s0, 1, s0 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 ; GFX10_W64-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GFX10_W64-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-NEXT: v_div_fmas_f32 v0, v2, v3, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: global_store_dword v1, v0, s[4:5] offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -14,10 +14,11 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v2 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -34,8 +35,9 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: flat_load_dword v0, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_load_dword v1, v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -48,9 +50,9 @@ ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -79,10 +81,11 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v2, v0, v2 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -99,8 +102,9 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: flat_load_dword v0, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_load_dword v1, v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -113,9 +117,9 @@ ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -147,8 +151,9 @@ ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v3, s1 @@ -168,8 +173,9 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 @@ -183,9 +189,9 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -217,8 +223,9 @@ ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v3, s1 @@ -238,8 +245,9 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 @@ -253,9 +261,9 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1056,12 +1064,12 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v1 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1078,11 +1086,11 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX8-NEXT: flat_load_dword v0, v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_load_dword v1, v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1094,12 +1102,11 @@ ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] @@ -1129,10 +1136,11 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v2 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] @@ -1150,8 +1158,9 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: flat_load_dword v0, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_load_dword v1, v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 @@ -1165,9 +1174,9 @@ ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll @@ -16,11 +16,13 @@ ; GCN-NEXT: ; %bb.1: ; %mid ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: BB0_2: ; %bb ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_endpgm entry: %cond = icmp eq i32 %arg0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll @@ -15,10 +15,12 @@ ; GCN-NEXT: ; %bb.1: ; %mid ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: BB0_2: ; %bb ; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm entry: %cond = icmp eq i32 %arg0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll @@ -15,6 +15,7 @@ ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_endpgm entry: %cond = icmp eq i32 %arg0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll @@ -15,6 +15,7 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm entry: %cond = icmp eq i32 %arg0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -14,9 +14,10 @@ ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_load_dword s0, s[4:5], 0x11 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1 ; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; CI-NEXT: flat_store_dword v[0:1], v0 @@ -27,10 +28,10 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_dword v[0:1], v0, off @@ -59,6 +60,7 @@ ; CI-NEXT: ; %bb.1: ; %bb0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: flat_store_dword v[0:1], v0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: BB1_2: ; %bb1 ; CI-NEXT: s_endpgm ; @@ -76,6 +78,7 @@ ; GFX9-NEXT: ; %bb.1: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: BB1_2: ; %bb1 ; GFX9-NEXT: s_endpgm %val = call i1 @llvm.amdgcn.is.private(i8* %ptr) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -14,9 +14,10 @@ ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_load_dword s0, s[4:5], 0x10 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1 ; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; CI-NEXT: flat_store_dword v[0:1], v0 @@ -27,10 +28,10 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_dword v[0:1], v0, off @@ -59,6 +60,7 @@ ; CI-NEXT: ; %bb.1: ; %bb0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: flat_store_dword v[0:1], v0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: BB1_2: ; %bb1 ; CI-NEXT: s_endpgm ; @@ -76,6 +78,7 @@ ; GFX9-NEXT: ; %bb.1: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: BB1_2: ; %bb1 ; GFX9-NEXT: s_endpgm %val = call i1 @llvm.amdgcn.is.shared(i8* %ptr) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll @@ -32,6 +32,7 @@ ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 ; CI-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_endpgm ; ; VI-LABEL: s_trig_preop_f64: @@ -42,6 +43,7 @@ ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_trig_preop_f64: @@ -52,6 +54,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 %b) store volatile double %result, double* undef @@ -65,6 +68,7 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 7) store volatile double %result, double* undef diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -17,17 +17,23 @@ ; GFX9-NEXT: ; %bb.1: ; %bb1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5be6 ; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c7 ; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e8 ; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c8 ; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e7 ; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: BB0_2: ; %Flow ; GFX9-NEXT: s_xor_b32 s0, s0, -1 ; GFX9-NEXT: s_and_b32 s0, s0, 1 @@ -36,16 +42,22 @@ ; GFX9-NEXT: ; %bb.3: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c8 ; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e7 ; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e8 ; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c7 ; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5be6 ; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: BB0_4: ; %bb2 ; GFX9-NEXT: s_endpgm entry: @@ -104,7 +116,9 @@ ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: BB1_2: ; %Flow ; GFX9-NEXT: s_xor_b32 s0, s0, -1 ; GFX9-NEXT: s_and_b32 s0, s0, 1 @@ -123,7 +137,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: BB1_4: ; %bb2 ; GFX9-NEXT: s_endpgm entry: @@ -165,11 +181,13 @@ ; GFX9-NEXT: s_addc_u32 s7, s7, static.gv2@rel32@hi+12 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_store_dword v0, v0, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, static.gv3@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, static.gv3@rel32@hi+12 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: BB2_2: ; %Flow ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -180,11 +198,13 @@ ; GFX9-NEXT: s_addc_u32 s7, s7, static.gv0@rel32@hi+12 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_store_dword v0, v0, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, static.gv1@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, static.gv1@rel32@hi+12 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: BB2_4: ; %bb2 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -468,7 +468,7 @@ ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -478,7 +478,7 @@ ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %val = load volatile float, float addrspace(1)* %ptr @@ -493,7 +493,7 @@ ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_movk_i32 s4, 0x3ffc -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], s4 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], s4 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -504,7 +504,7 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_movk_i32 s4, 0x3ffc -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s4 +; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s4 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4095 @@ -523,7 +523,7 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -537,7 +537,7 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967296 @@ -556,7 +556,7 @@ ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -570,7 +570,7 @@ ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967297 @@ -586,7 +586,7 @@ ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_movk_i32 s4, 0x4000 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], s4 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], s4 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -597,7 +597,7 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_movk_i32 s4, 0x4000 -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s4 +; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s4 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4096 @@ -612,7 +612,7 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: s_movk_i32 s4, 0x3ffc -; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -622,7 +622,7 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_movk_i32 s4, 0x3ffc -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4095 @@ -637,7 +637,7 @@ ; GFX6-NEXT: s_mov_b32 s1, 4 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, s0 -; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -647,7 +647,7 @@ ; GFX7-NEXT: s_mov_b32 s1, 4 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, s0 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967296 @@ -662,7 +662,7 @@ ; GFX6-NEXT: s_mov_b32 s1, s0 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -672,7 +672,7 @@ ; GFX7-NEXT: s_mov_b32 s1, s0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967297 @@ -687,7 +687,7 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: s_movk_i32 s4, 0x4000 -; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -697,7 +697,7 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_movk_i32 s4, 0x4000 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4096 @@ -716,7 +716,7 @@ ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -730,7 +730,7 @@ ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i32 %soffset @@ -745,7 +745,7 @@ ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -755,7 +755,7 @@ ; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i32 %soffset @@ -770,7 +770,7 @@ ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -780,7 +780,7 @@ ; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep0 = getelementptr float, float addrspace(1)* %ptr, i32 %soffset @@ -802,7 +802,7 @@ ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, s5 -; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -818,7 +818,7 @@ ; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, s5 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 256 @@ -836,7 +836,7 @@ ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -848,7 +848,7 @@ ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i32 %voffset @@ -866,7 +866,7 @@ ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_movk_i32 s4, 0x3ffc -; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -879,7 +879,7 @@ ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_movk_i32 s4, 0x3ffc -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep0 = getelementptr float, float addrspace(1)* %ptr, i32 %voffset @@ -896,7 +896,7 @@ ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_addc_u32 s5, s3, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -908,7 +908,7 @@ ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_addc_u32 s5, s3, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 4095 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -56,6 +56,7 @@ ; GCN-NEXT: BB0_3: ; %bb.2 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm entry: @@ -129,6 +130,7 @@ ; GCN-NEXT: BB1_2: ; %bb.1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm entry: %cond = icmp eq i32 %arg.cond, 0 @@ -193,9 +195,9 @@ ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: s_mov_b32 s33, s8 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] entry: @@ -257,9 +259,9 @@ ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_sub_u32 s32, s32, 0x2000 ; GCN-NEXT: s_mov_b32 s33, s8 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] entry: %cond = icmp eq i32 %arg.cond, 0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll --- a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll @@ -2,7 +2,7 @@ ; GCN-LABEL: {{^}}kernel_ieee_mode_default: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] ; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] ; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] @@ -17,7 +17,7 @@ ; GCN-LABEL: {{^}}kernel_ieee_mode_on: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] ; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] ; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] @@ -32,7 +32,7 @@ ; GCN-LABEL: {{^}}kernel_ieee_mode_off: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] ; GCN-NOT: [[VAL0]] ; GCN-NOT: [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] @@ -47,7 +47,7 @@ ; GCN-LABEL: {{^}}func_ieee_mode_default: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] ; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] ; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] @@ -62,7 +62,7 @@ ; GCN-LABEL: {{^}}func_ieee_mode_on: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] ; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] ; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] @@ -77,7 +77,7 @@ ; GCN-LABEL: {{^}}func_ieee_mode_off: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] ; GCN-NOT: [[VAL0]] ; GCN-NOT: [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] @@ -92,7 +92,7 @@ ; GCN-LABEL: {{^}}cs_ieee_mode_default: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] ; GCN-NOT: [[VAL0]] ; GCN-NOT: [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] @@ -107,7 +107,7 @@ ; GCN-LABEL: {{^}}cs_ieee_mode_on: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] ; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] ; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] @@ -122,7 +122,7 @@ ; GCN-LABEL: {{^}}cs_ieee_mode_off: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] ; GCN-NOT: [[VAL0]] ; GCN-NOT: [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] @@ -137,7 +137,7 @@ ; GCN-LABEL: {{^}}ps_ieee_mode_default: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] ; GCN-NOT: [[VAL0]] ; GCN-NOT: [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] @@ -152,7 +152,7 @@ ; GCN-LABEL: {{^}}ps_ieee_mode_on: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] ; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] ; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] @@ -167,7 +167,7 @@ ; GCN-LABEL: {{^}}ps_ieee_mode_off: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] ; GCN-NOT: [[VAL0]] ; GCN-NOT: [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -245,7 +245,7 @@ ; GCN: v_mov_b32_e32 [[BB4_K:v[0-9]+]], 63 ; GCN: buffer_store_dword [[BB4_K]] -; GCN-NEXT: s_endpgm +; GCN: s_endpgm ; GCN-NEXT: .Lfunc_end{{[0-9]+}}: define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) { bb0: diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -11,12 +11,12 @@ ; GCN: s_and_saveexec_b64 ; GCN: s_cbranch_execz [[BB1:BB[0-9]+_[0-9]+]] -; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}} +; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32 glc{{$}} ; GCN-NOT: s32 ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} ; GCN-NOT: s32 -; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} +; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16 glc{{$}} ; GCN-NOT: s32 ; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}} ; GCN-NOT: s32 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -87,8 +87,6 @@ ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+12 - -; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm @@ -110,9 +108,6 @@ ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12 - - -; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -57,10 +57,10 @@ ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4{{$}} ; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 8 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_all() #1 { %alloca = alloca i32, addrspace(5) @@ -346,10 +346,10 @@ ; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; MUBUF-NEXT: buffer_store_dword [[ZERO]], off, s[0:3], s33 ; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s33 +; GCN-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x100000 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x4000 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @realign_stack_no_fp_elim() #1 { %alloca = alloca i32, align 8192, addrspace(5) @@ -366,6 +366,7 @@ ; GCN: v_writelane_b32 v1, s31, 1 ; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 ; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN: ;;#ASMSTART ; MUBUF: v_readlane_b32 s4, v1, 0 ; MUBUF-NEXT: s_add_u32 s32, s32, 0x200 @@ -376,7 +377,6 @@ ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 8 ; GCN-NEXT: v_readlane_b32 s33, v1, 2 -; GCN-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[4:5] ; FLATSCR-NEXT: s_setpc_b64 s[0:1] define void @no_unused_non_csr_sgpr_for_fp() #1 { diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -399,9 +399,7 @@ ; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}} ; VARABI: v_and_b32_e32 v32, 0x3ff, v32 ; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 - -; VARABI: s_waitcnt -; VARABI-NEXT: s_setpc_b64 +; VARABI: s_setpc_b64 ; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31 ; FIXEDABI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} @@ -545,7 +543,7 @@ ; VARABI-NEXT: s_waitcnt ; VARABI-NEXT: v_and_b32_e32 v32, 0x3ff, v32 ; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32 -; VARABI: buffer_load_dword v0, off, s[0:3], s32{{$}} +; VARABI: buffer_load_dword v0, off, s[0:3], s32 glc{{$}} ; VARABI: s_setpc_b64 @@ -554,7 +552,7 @@ ; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32{{$}} ; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32 offset:4{{$}} +; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}} ; FIXEDABI: s_setpc_b64 define void @too_many_args_use_workitem_id_x_byval( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, @@ -704,10 +702,8 @@ ; VARABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v32, 10, 10 ; VARABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v32, 20, 10 ; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]] -; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]] - -; VARABI: s_waitcnt -; VARABI-NEXT: s_setpc_b64 +; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]] +; VARABI: s_setpc_b64 ; FIXEDABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v31 @@ -717,7 +713,7 @@ ; FIXEDABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v31, 10, 10 ; FIXEDABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v31, 20, 10 ; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]] -; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]] +; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]] define void @too_many_args_use_workitem_id_xyz( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, @@ -810,9 +806,7 @@ ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDY]] ; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDZ]] - -; GCN: s_waitcnt -; GCN-NEXT: s_setpc_b64 +; GCN: s_setpc_b64 ; GCN: ScratchSize: 0 define void @too_many_args_use_workitem_id_x_stack_yz( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -29,6 +29,7 @@ ; GFX803-NEXT: v_mov_b32_e32 v0, 0 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_kern_stack: @@ -39,6 +40,7 @@ ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_endpgm ; ; GFX1010-LABEL: test_kern_stack: @@ -51,6 +53,7 @@ ; GFX1010-NEXT: s_add_u32 s0, s0, s7 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-NEXT: s_endpgm entry: %x = alloca i32, align 4, addrspace(5) @@ -119,6 +122,7 @@ ; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 ; GFX803-NEXT: s_movk_i32 s32, 0x400 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX803-NEXT: s_endpgm ; @@ -134,6 +138,7 @@ ; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 ; GFX900-NEXT: s_movk_i32 s32, 0x400 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX900-NEXT: s_endpgm ; @@ -151,6 +156,7 @@ ; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 ; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1010-NEXT: s_endpgm entry: @@ -190,6 +196,7 @@ ; GFX803-NEXT: v_mov_b32_e32 v0, 0 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_force_fp_kern_stack: @@ -201,6 +208,7 @@ ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_endpgm ; ; GFX1010-LABEL: test_force_fp_kern_stack: @@ -214,6 +222,7 @@ ; GFX1010-NEXT: s_add_u32 s0, s0, s7 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-NEXT: s_endpgm entry: %x = alloca i32, align 4, addrspace(5) @@ -286,6 +295,7 @@ ; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 ; GFX803-NEXT: s_movk_i32 s32, 0x400 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX803-NEXT: s_endpgm ; @@ -302,6 +312,7 @@ ; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 ; GFX900-NEXT: s_movk_i32 s32, 0x400 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX900-NEXT: s_endpgm ; @@ -320,6 +331,7 @@ ; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 ; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1010-NEXT: s_endpgm entry: @@ -336,10 +348,10 @@ ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; GFX803-NEXT: s_add_u32 s0, s0, s7 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 -; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 +; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_mov_b32 s4, 0x40000 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 -; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill ; GFX803-NEXT: ;;#ASMSTART ; GFX803-NEXT: ;;#ASMEND @@ -347,6 +359,7 @@ ; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_sgpr_offset_kernel: @@ -355,9 +368,9 @@ ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX900-NEXT: s_add_u32 s0, s0, s7 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 -; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 -; GFX900-NEXT: s_mov_b32 s6, 0x40000 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_mov_b32 s6, 0x40000 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ;;#ASMEND @@ -365,6 +378,7 @@ ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_endpgm ; ; GFX1010-LABEL: test_sgpr_offset_kernel: @@ -376,7 +390,7 @@ ; GFX1010-NEXT: s_add_u32 s0, s0, s7 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: s_mov_b32 s6, 0x20000 -; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 +; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc ; GFX1010-NEXT: s_waitcnt vmcnt(0) ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX1010-NEXT: s_waitcnt_depctr 0xffe3 @@ -386,6 +400,7 @@ ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX1010-NEXT: s_waitcnt vmcnt(0) ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-NEXT: s_endpgm entry: ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -codegenprepare -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN %s @@ -36,7 +37,8 @@ ; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] offset:28 -; GCN-NEXT: global_load_dword v0, v[0:1], off +; GCN-NEXT: global_load_dword v0, v[0:1], off glc +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: BB0_2: ; %endif ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0000 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -137,7 +137,7 @@ ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32: ; GCN: s_and_saveexec_b64 ; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}} -; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}} +; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092 glc{{$}} ; GCN: {{^}}BB4_2: define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: @@ -177,7 +177,7 @@ ; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4 ; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}} ; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4 -; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}} +; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092 glc{{$}} ; GCN: {{^BB[0-9]+}}_2: define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { @@ -215,7 +215,7 @@ ; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32: ; GCN: s_and_saveexec_b64 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc{{$}} ; GCN: {{^BB[0-9]+}}_2: define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -233,12 +233,15 @@ ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:2 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4 ; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6 ; GFX900-NEXT: s_waitcnt vmcnt(1) @@ -262,14 +265,17 @@ ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:4 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:2 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:6 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:4 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:8 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 ; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 @@ -387,10 +393,11 @@ ; GCN-LABEL: chain_hi_to_lo_global_other_dep: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2 -; GCN-NEXT: global_load_short_d16_hi v0, v[0:1], off -; GCN-NEXT: v_mov_b32_e32 v1, 0xffff +; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_load_short_d16_hi v0, v[0:1], off glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, 0xffff ; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] ; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -409,10 +416,12 @@ ; GCN-LABEL: chain_hi_to_lo_flat_other_dep: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: flat_load_ushort v2, v[0:1] offset:2 -; GCN-NEXT: flat_load_short_d16_hi v0, v[0:1] +; GCN-NEXT: flat_load_ushort v2, v[0:1] offset:2 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_load_short_d16_hi v0, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] ; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll b/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll --- a/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll @@ -64,7 +64,7 @@ } ; FUNC-LABEL: @commute_add_fabs_f32 -; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]| ; SI: buffer_store_dword [[REG]] @@ -81,7 +81,7 @@ } ; FUNC-LABEL: @commute_mul_fneg_f32 -; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]] ; SI: buffer_store_dword [[REG]] @@ -98,7 +98,7 @@ } ; FUNC-LABEL: @commute_mul_fabs_fneg_f32 -; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]| ; SI: buffer_store_dword [[REG]] @@ -117,7 +117,7 @@ ; There's no reason to commute this. ; FUNC-LABEL: @commute_mul_fabs_x_fabs_y_f32 -; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]| ; SI: buffer_store_dword [[REG]] @@ -135,7 +135,7 @@ } ; FUNC-LABEL: @commute_mul_fabs_x_fneg_fabs_y_f32 -; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]| ; SI: buffer_store_dword [[REG]] @@ -157,7 +157,7 @@ ; though we have negate modifier on src2. ; SI-LABEL: {{^}}fma_a_2.0_neg_b_f32 -; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, |[[R2]]| ; SI: buffer_store_dword [[RESULT]] diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -567,7 +567,7 @@ ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -584,7 +584,7 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -612,9 +612,13 @@ ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_byte v3, off, s[0:3], 0 offset:3 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:2 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_volatile_store: @@ -635,9 +639,13 @@ ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_byte v3, off, s[0:3], 0 offset:3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:2 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -50,6 +50,7 @@ ; CI-NEXT: s_mov_b32 s1, s0 ; CI-NEXT: ds_write_b32 v0, v2 offset:12 ; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: write_ds_sub0_offset0_global_clamp_bit: @@ -66,6 +67,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm entry: %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -237,6 +239,7 @@ ; CI-NEXT: s_mov_b32 s1, s0 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 ; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: @@ -254,6 +257,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -50,10 +50,11 @@ ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 -; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8 ; CI-NEXT: s_endpgm ; @@ -62,8 +63,9 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8 ; GFX9-NEXT: s_endpgm @@ -92,12 +94,12 @@ ; CI-NEXT: s_mov_b64 s[0:1], s[4:5] ; CI-NEXT: s_mov_b64 s[4:5], s[6:7] ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: ds_write_b32 v0, v2 -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: ds_write_b32 v0, v1 offset:32 ; CI-NEXT: s_endpgm ; @@ -106,11 +108,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: ds_write_b32 v0, v1 +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ds_write_b32 v0, v1 ; GFX9-NEXT: ds_write_b32 v0, v2 offset:32 ; GFX9-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -138,12 +140,12 @@ ; CI-NEXT: s_mov_b64 s[0:1], s[4:5] ; CI-NEXT: s_mov_b64 s[4:5], s[6:7] ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: ds_write_b32 v0, v2 -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: ds_write_b32 v0, v1 offset:32 ; CI-NEXT: s_endpgm ; @@ -152,11 +154,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: ds_write_b32 v0, v1 +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ds_write_b32 v0, v1 ; GFX9-NEXT: ds_write_b32 v0, v2 offset:32 ; GFX9-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -186,11 +188,12 @@ ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 -; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8 +; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8 glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: ds_write2_b32 v0, v3, v2 offset1:8 ; CI-NEXT: s_endpgm ; @@ -200,9 +203,9 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v3, s[0:1] offset:8 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v3, s[0:1] offset:8 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:8 ; GFX9-NEXT: s_endpgm @@ -308,10 +311,11 @@ ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 -; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:255 ; CI-NEXT: s_endpgm ; @@ -320,8 +324,9 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:255 ; GFX9-NEXT: s_endpgm @@ -730,10 +735,11 @@ ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 -; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:8 ; CI-NEXT: s_endpgm ; @@ -742,8 +748,9 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:8 ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll b/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll --- a/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll @@ -27,11 +27,11 @@ ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4{{$}} +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} glc{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 glc{{$}} ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} @@ -56,10 +56,10 @@ ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} glc{{$}} ; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}} @@ -84,10 +84,10 @@ ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]$}} +; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} glc{{$}} ; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:8 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -88,11 +88,13 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, 9 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: BB1_2: ; %bb1 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 10 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX9-LABEL: s_add_co_br_user: @@ -113,11 +115,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 9 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: BB1_2: ; %bb1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 10 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_add_co_br_user: @@ -137,11 +141,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 9 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: BB1_2: ; %bb1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 10 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm bb: %i1 = add i32 %i, %i diff --git a/llvm/test/CodeGen/AMDGPU/extload-private.ll b/llvm/test/CodeGen/AMDGPU/extload-private.ll --- a/llvm/test/CodeGen/AMDGPU/extload-private.ll +++ b/llvm/test/CodeGen/AMDGPU/extload-private.ll @@ -35,7 +35,7 @@ } ; FUNC-LABEL: {{^}}load_i16_zext_private: -; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}} +; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 glc{{$}} define amdgpu_kernel void @load_i16_zext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i16, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll --- a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll @@ -106,10 +106,10 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s8, s2 ; GCN-NEXT: s_mov_b32 s9, s3 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm entry: @@ -130,10 +130,10 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s8, s2 ; GCN-NEXT: s_mov_b32 s9, s3 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; GCN-NEXT: s_endpgm entry: @@ -155,11 +155,11 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s8, s6 ; GCN-NEXT: s_mov_b32 s9, s7 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s12, 2 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc diff --git a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll --- a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll @@ -231,12 +231,16 @@ ; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]] ; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]] ; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]] +; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]] +; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]] ; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]] ; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]] +; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) ; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]] +; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) ; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]] ; GCN-SLOWFMA: v_add_f32_e32 @@ -267,12 +271,16 @@ ; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]] ; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]] ; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]] +; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]] +; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]] ; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]] ; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]] +; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) ; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]] +; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) ; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]] ; GCN-SLOWFMA: v_add_f32_e32 @@ -305,12 +313,16 @@ ; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]] ; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]] ; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]] +; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]] +; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[UFLOAT]] ; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]] ; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]] +; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) ; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]] +; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) ; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]] ; GCN-SLOWFMA: v_add_f32_e32 @@ -344,12 +356,16 @@ ; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]] ; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]] ; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]] +; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]] +; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[UFLOAT]] ; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]] ; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]] +; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) ; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]] +; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) ; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]] ; GCN-SLOWFMA: v_add_f32_e32 diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll --- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll @@ -201,8 +201,9 @@ } ; CHECK-LABEL: {{^}}load_flat_i8_max_offset: -; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}} -; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095{{$}} +; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}} +; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc{{$}} +; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}} define amdgpu_kernel void @load_flat_i8_max_offset(i8* %fptr) #0 { %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095 %val = load volatile i8, i8* %fptr.offset @@ -210,7 +211,9 @@ } ; CHECK-LABEL: {{^}}load_flat_i8_max_offset_p1: -; CHECK: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}} +; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}} +; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}} +; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}} define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8* %fptr) #0 { %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096 %val = load volatile i8, i8* %fptr.offset @@ -218,11 +221,11 @@ } ; CHECK-LABEL: {{^}}load_flat_i8_neg_offset: -; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}} +; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s ; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, -; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @load_flat_i8_neg_offset(i8* %fptr) #0 { %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2 %val = load volatile i8, i8* %fptr.offset diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -197,8 +197,10 @@ ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: s_add_u32 s1, 4, s1 ; GFX9-NEXT: scratch_store_dword off, v0, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_add_u32 s0, 4, s0 -; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_kernel: @@ -216,7 +218,9 @@ ; GFX10-NEXT: s_add_u32 s0, 4, s0 ; GFX10-NEXT: s_add_u32 s1, 4, s1 ; GFX10-NEXT: scratch_store_dword off, v0, s0 -; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_kernel: @@ -235,8 +239,10 @@ ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: s_add_u32 s1, 4, s1 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_add_u32 s0, 4, s0 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_sindex_kernel: @@ -259,7 +265,9 @@ ; GFX10-PAL-NEXT: s_add_u32 s0, 4, s0 ; GFX10-PAL-NEXT: s_add_u32 s1, 4, s1 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) @@ -283,10 +291,12 @@ ; GFX9-NEXT: s_add_u32 s0, 4, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_and_b32 s0, s2, 15 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: s_add_u32 s0, 4, s0 -; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_foo: @@ -302,7 +312,9 @@ ; GFX10-NEXT: s_add_u32 s1, 4, s1 ; GFX10-NEXT: s_add_u32 s0, 4, s0 ; GFX10-NEXT: scratch_store_dword off, v0, s1 -; GFX10-NEXT: scratch_load_dword v0, off, s0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_foo: @@ -320,8 +332,10 @@ ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: s_add_u32 s1, 4, s1 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_add_u32 s0, 4, s0 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_sindex_foo: @@ -342,7 +356,9 @@ ; GFX10-PAL-NEXT: s_add_u32 s0, 4, s0 ; GFX10-PAL-NEXT: s_add_u32 s1, 4, s1 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) @@ -367,8 +383,10 @@ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 -; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vindex_kernel: @@ -383,7 +401,9 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GFX10-NEXT: scratch_store_dword v2, v3, off -; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vindex_kernel: @@ -400,8 +420,10 @@ ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 -; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_vindex_kernel: @@ -421,7 +443,9 @@ ; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 ; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off -; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) @@ -447,8 +471,9 @@ ; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 ; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -462,9 +487,9 @@ ; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 ; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 ; GFX10-NEXT: scratch_store_dword v0, v1, off -; GFX10-NEXT: scratch_load_dword v0, v2, off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, v2, off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_vindex_foo: @@ -475,8 +500,9 @@ ; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 ; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off +; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -490,9 +516,9 @@ ; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 ; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 ; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off -; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [32 x float], align 4, addrspace(5) @@ -552,12 +578,12 @@ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_mov_b32 s1, s0 ; GFX9-NEXT: s_mov_b32 s2, s0 ; GFX9-NEXT: s_mov_b32 s3, s0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -578,12 +604,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -605,11 +631,11 @@ ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_mov_b32 s1, s0 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0 -; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -635,12 +661,12 @@ ; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_mov_b32 s0, 0 ; GFX10-PAL-NEXT: s_mov_b32 s1, s0 ; GFX10-PAL-NEXT: s_mov_b32 s2, s0 ; GFX10-PAL-NEXT: s_mov_b32 s3, s0 -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -663,12 +689,12 @@ ; GFX9-LABEL: zero_init_small_offset_foo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: scratch_load_dword v0, off, s32 +; GFX9-NEXT: scratch_load_dword v0, off, s32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_mov_b32 s1, s0 ; GFX9-NEXT: s_mov_b32 s2, s0 ; GFX9-NEXT: s_mov_b32 s3, s0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -684,12 +710,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, off, s32 +; GFX10-NEXT: scratch_load_dword v0, off, s32 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -704,12 +730,12 @@ ; GFX9-PAL-LABEL: zero_init_small_offset_foo: ; GFX9-PAL: ; %bb.0: ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: s_mov_b32 s1, s0 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0 -; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -725,12 +751,12 @@ ; GFX10-PAL: ; %bb.0: ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_mov_b32 s0, 0 ; GFX10-PAL-NEXT: s_mov_b32 s1, s0 ; GFX10-PAL-NEXT: s_mov_b32 s2, s0 ; GFX10-PAL-NEXT: s_mov_b32 s3, s0 -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -757,8 +783,8 @@ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 @@ -766,8 +792,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_add_u32 s1, 0x104, s1 ; GFX9-NEXT: scratch_store_dword off, v0, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_add_u32 s0, 0x104, s0 -; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_small_offset_kernel: @@ -777,7 +805,7 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -787,7 +815,9 @@ ; GFX10-NEXT: s_add_u32 s0, 0x104, s0 ; GFX10-NEXT: s_add_u32 s1, 0x104, s1 ; GFX10-NEXT: scratch_store_dword off, v0, s0 -; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: @@ -803,14 +833,16 @@ ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 -; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: s_add_u32 s1, 0x104, s1 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_add_u32 s0, 0x104, s0 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_sindex_small_offset_kernel: @@ -825,7 +857,7 @@ ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 ; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) @@ -835,7 +867,9 @@ ; GFX10-PAL-NEXT: s_add_u32 s0, 0x104, s0 ; GFX10-PAL-NEXT: s_add_u32 s1, 0x104, s1 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -860,15 +894,17 @@ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: s_lshl_b32 s0, s2, 2 -; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 -; GFX9-NEXT: s_add_u32 s0, 0x104, s0 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_add_u32 s0, 0x104, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_and_b32 s0, s2, 15 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: s_add_u32 s0, 0x104, s0 -; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_small_offset_foo: @@ -877,16 +913,18 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 -; GFX10-NEXT: s_and_b32 s0, s2, 15 +; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_and_b32 s0, s2, 15 ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_lshl_b32 s1, s2, 2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-NEXT: s_add_u32 s1, 0x104, s1 ; GFX10-NEXT: s_add_u32 s0, 0x104, s0 ; GFX10-NEXT: scratch_store_dword off, v0, s1 -; GFX10-NEXT: scratch_load_dword v0, off, s0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo: @@ -901,14 +939,16 @@ ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: s_add_u32 s1, 0x104, s1 -; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_add_u32 s0, 0x104, s0 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_sindex_small_offset_foo: @@ -922,16 +962,18 @@ ; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 -; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 ; GFX10-PAL-NEXT: s_add_u32 s0, 0x104, s0 ; GFX10-PAL-NEXT: s_add_u32 s1, 0x104, s1 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -955,15 +997,17 @@ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 -; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vindex_small_offset_kernel: @@ -977,9 +1021,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, 15 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 +; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: scratch_store_dword v2, v3, off -; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: @@ -994,13 +1041,15 @@ ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 +; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 ; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 -; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_vindex_small_offset_kernel: @@ -1019,9 +1068,12 @@ ; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 ; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 ; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX10-PAL-NEXT: scratch_load_dword v1, off, off offset:4 +; GFX10-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off -; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -1045,16 +1097,17 @@ ; GFX9-LABEL: store_load_vindex_small_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: scratch_load_dword v1, off, s32 -; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x100 +; GFX9-NEXT: scratch_load_dword v1, off, s32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x100 ; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi ; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 ; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1068,26 +1121,28 @@ ; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 ; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 ; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 -; GFX10-NEXT: scratch_load_dword v3, off, s32 -; GFX10-NEXT: scratch_store_dword v0, v1, off -; GFX10-NEXT: scratch_load_dword v0, v2, off +; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_store_dword v0, v1, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, v2, off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 -; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x100 +; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x100 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 ; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off +; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -1101,11 +1156,12 @@ ; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 ; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 ; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 -; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 -; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off -; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off +; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -1129,12 +1185,12 @@ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_mov_b32 s1, s0 ; GFX9-NEXT: s_mov_b32 s2, s0 ; GFX9-NEXT: s_mov_b32 s3, s0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -1155,13 +1211,13 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -1186,11 +1242,11 @@ ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_mov_b32 s1, s0 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0 -; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -1216,13 +1272,13 @@ ; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_mov_b32 s0, 0 ; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 ; GFX10-PAL-NEXT: s_mov_b32 s1, s0 ; GFX10-PAL-NEXT: s_mov_b32 s2, s0 ; GFX10-PAL-NEXT: s_mov_b32 s3, s0 -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -1248,12 +1304,12 @@ ; GFX9-LABEL: zero_init_large_offset_foo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: scratch_load_dword v0, off, s32 +; GFX9-NEXT: scratch_load_dword v0, off, s32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_mov_b32 s1, s0 ; GFX9-NEXT: s_mov_b32 s2, s0 ; GFX9-NEXT: s_mov_b32 s3, s0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -1273,13 +1329,13 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, off, s32 +; GFX10-NEXT: scratch_load_dword v0, off, s32 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -1297,12 +1353,12 @@ ; GFX9-PAL-LABEL: zero_init_large_offset_foo: ; GFX9-PAL: ; %bb.0: ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: s_mov_b32 s1, s0 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0 -; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -1322,13 +1378,13 @@ ; GFX10-PAL: ; %bb.0: ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_mov_b32 s0, 0 ; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 ; GFX10-PAL-NEXT: s_mov_b32 s1, s0 ; GFX10-PAL-NEXT: s_mov_b32 s2, s0 ; GFX10-PAL-NEXT: s_mov_b32 s3, s0 -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -1358,8 +1414,8 @@ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 @@ -1367,8 +1423,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_add_u32 s1, 0x4004, s1 ; GFX9-NEXT: scratch_store_dword off, v0, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 -; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_large_offset_kernel: @@ -1378,7 +1436,7 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1388,7 +1446,9 @@ ; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 ; GFX10-NEXT: s_add_u32 s1, 0x4004, s1 ; GFX10-NEXT: scratch_store_dword off, v0, s0 -; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: @@ -1404,14 +1464,16 @@ ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 -; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: s_add_u32 s1, 0x4004, s1 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_add_u32 s0, 0x4004, s0 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_sindex_large_offset_kernel: @@ -1426,7 +1488,7 @@ ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 ; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) @@ -1436,7 +1498,9 @@ ; GFX10-PAL-NEXT: s_add_u32 s0, 0x4004, s0 ; GFX10-PAL-NEXT: s_add_u32 s1, 0x4004, s1 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -1461,15 +1525,17 @@ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: s_lshl_b32 s0, s2, 2 -; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 -; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_and_b32 s0, s2, 15 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 -; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_large_offset_foo: @@ -1478,16 +1544,18 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 -; GFX10-NEXT: s_and_b32 s0, s2, 15 +; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_and_b32 s0, s2, 15 ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_lshl_b32 s1, s2, 2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-NEXT: s_add_u32 s1, 0x4004, s1 ; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 ; GFX10-NEXT: scratch_store_dword off, v0, s1 -; GFX10-NEXT: scratch_load_dword v0, off, s0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo: @@ -1502,14 +1570,16 @@ ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: s_add_u32 s1, 0x4004, s1 -; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_add_u32 s0, 0x4004, s0 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_sindex_large_offset_foo: @@ -1523,16 +1593,18 @@ ; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 -; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 ; GFX10-PAL-NEXT: s_add_u32 s0, 0x4004, s0 ; GFX10-PAL-NEXT: s_add_u32 s1, 0x4004, s1 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -1556,15 +1628,17 @@ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 -; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vindex_large_offset_kernel: @@ -1578,9 +1652,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, 15 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 +; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: scratch_store_dword v2, v3, off -; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel: @@ -1595,13 +1672,15 @@ ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 +; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 ; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 -; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_vindex_large_offset_kernel: @@ -1620,9 +1699,12 @@ ; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 ; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 ; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX10-PAL-NEXT: scratch_load_dword v1, off, off offset:4 +; GFX10-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off -; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -1646,16 +1728,17 @@ ; GFX9-LABEL: store_load_vindex_large_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: scratch_load_dword v1, off, s32 -; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_load_dword v1, off, s32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 ; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi ; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 ; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1669,26 +1752,28 @@ ; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 ; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 ; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 -; GFX10-NEXT: scratch_load_dword v3, off, s32 -; GFX10-NEXT: scratch_store_dword v0, v1, off -; GFX10-NEXT: scratch_load_dword v0, v2, off +; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_store_dword v0, v1, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, v2, off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 -; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 ; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off +; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -1702,11 +1787,12 @@ ; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 ; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 ; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 -; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 -; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off -; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off +; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -1733,10 +1819,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_add_u32 s0, 4, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 -; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_large_imm_offset_kernel: @@ -1750,8 +1839,11 @@ ; GFX10-NEXT: s_movk_i32 s0, 0x3800 ; GFX10-NEXT: s_add_u32 s0, 4, s0 ; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 -; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel: @@ -1767,10 +1859,13 @@ ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_add_u32 s0, 4, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_large_imm_offset_kernel: @@ -1789,8 +1884,11 @@ ; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 ; GFX10-PAL-NEXT: s_add_u32 s0, 4, s0 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, off offset:4 +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_endpgm bb: %i = alloca [4096 x i32], align 4, addrspace(5) @@ -1810,10 +1908,12 @@ ; GFX9-NEXT: s_movk_i32 s0, 0x3000 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: scratch_store_dword off, v0, s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s32, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 -; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1826,10 +1926,11 @@ ; GFX10-NEXT: s_movk_i32 s0, 0x3800 ; GFX10-NEXT: s_add_u32 s0, s32, s0 ; GFX10-NEXT: scratch_store_dword off, v0, s32 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 -; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: @@ -1838,10 +1939,12 @@ ; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_add_u32 s0, s32, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -1854,10 +1957,11 @@ ; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 ; GFX10-PAL-NEXT: s_add_u32 s0, s32, s0 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 -; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [4096 x i32], align 4, addrspace(5) @@ -1882,7 +1986,9 @@ ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 -; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vidx_sidx_offset: @@ -1897,7 +2003,9 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 4 ; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024 -; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vidx_sidx_offset: @@ -1915,7 +2023,9 @@ ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 -; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_vidx_sidx_offset: @@ -1935,7 +2045,9 @@ ; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4 ; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 -; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_endpgm bb: %alloca = alloca [32 x i32], align 4, addrspace(5) @@ -1955,7 +2067,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off -; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1966,9 +2079,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off -; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_i64_aligned: @@ -1977,7 +2090,8 @@ ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off -; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -1988,9 +2102,9 @@ ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off -; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, i64 addrspace(5)* %arg, align 8 @@ -2005,7 +2119,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off -; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2016,9 +2131,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off -; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_i64_unaligned: @@ -2027,7 +2142,8 @@ ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off -; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -2038,9 +2154,9 @@ ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off -; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, i64 addrspace(5)* %arg, align 1 @@ -2056,7 +2172,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 2 ; GFX9-NEXT: v_mov_b32_e32 v3, 3 ; GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off -; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2068,9 +2185,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 2 ; GFX10-NEXT: v_mov_b32_e32 v3, 3 ; GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off -; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_v3i32_unaligned: @@ -2080,7 +2197,8 @@ ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3 ; GFX9-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off -; GFX9-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -2092,9 +2210,9 @@ ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2 ; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3 ; GFX10-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off -; GFX10-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: store volatile <3 x i32> , <3 x i32> addrspace(5)* %arg, align 1 @@ -2111,7 +2229,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, 3 ; GFX9-NEXT: v_mov_b32_e32 v4, 4 ; GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off -; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2124,9 +2243,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, 3 ; GFX10-NEXT: v_mov_b32_e32 v4, 4 ; GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off -; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_v4i32_unaligned: @@ -2137,7 +2256,8 @@ ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3 ; GFX9-PAL-NEXT: v_mov_b32_e32 v4, 4 ; GFX9-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off -; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -2150,9 +2270,9 @@ ; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3 ; GFX10-PAL-NEXT: v_mov_b32_e32 v4, 4 ; GFX10-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off -; GFX10-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc +; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: store volatile <4 x i32> , <4 x i32> addrspace(5)* %arg, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -16,9 +16,9 @@ ; (fadd (fmul x, y), z) -> (fma x, y, z) ; FUNC-LABEL: {{^}}combine_to_fma_f64_0: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] ; SI: buffer_store_dwordx2 [[RESULT]] define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { @@ -40,10 +40,10 @@ ; (fadd (fmul x, y), z) -> (fma x, y, z) ; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}} ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]] ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} @@ -73,9 +73,9 @@ ; (fadd x, (fmul y, z)) -> (fma y, z, x) ; FUNC-LABEL: {{^}}combine_to_fma_f64_1: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] ; SI: buffer_store_dwordx2 [[RESULT]] define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { @@ -97,9 +97,9 @@ ; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) ; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]] ; SI: buffer_store_dwordx2 [[RESULT]] define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { @@ -121,10 +121,10 @@ ; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) ; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}} ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]] ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]] ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} @@ -154,9 +154,9 @@ ; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) ; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]] ; SI: buffer_store_dwordx2 [[RESULT]] define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { @@ -178,10 +178,10 @@ ; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) ; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}} ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]] ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]] ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} @@ -211,9 +211,9 @@ ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) ; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] ; SI: buffer_store_dwordx2 [[RESULT]] define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { @@ -237,10 +237,10 @@ ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) ; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}} ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]] ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} @@ -272,10 +272,10 @@ ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) ; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}} ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]] ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} @@ -308,11 +308,11 @@ ; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z))) ; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64: -; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} -; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}} +; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32 glc{{$}} ; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]] ; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[TMP0]] @@ -349,11 +349,11 @@ ; -> (fma (fneg y), z, (fma (fneg u), v, x)) ; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64: -; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} -; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}} +; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}} +; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32 glc{{$}} ; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]] ; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[Y]], [[Z]], [[TMP0]] diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll @@ -4,7 +4,7 @@ declare double @llvm.maxnum.f64(double, double) nounwind readnone ; SI-LABEL: {{^}}test_fmax3_f64: -; SI: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}} +; SI: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 glc{{$}} ; SI: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8 ; SI: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16 ; SI: v_max_f64 [[QUIET_A:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGA]] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -37,7 +37,9 @@ ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]] ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN-NEXT: s_waitcnt vmcnt(0) define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -65,7 +67,9 @@ ; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_ADD]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] +; GCN-NEXT: s_waitcnt vmcnt(0) define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -168,7 +172,9 @@ ; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] ; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]] ; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]] +; GCN-NSZ-NEXT: s_waitcnt vmcnt(0) ; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]] +; GCN-NSZ-NEXT: s_waitcnt vmcnt(0) define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -196,7 +202,9 @@ ; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]] ; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} ; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]] +; GCN-NSZ-NEXT: s_waitcnt vmcnt(0) ; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] +; GCN-NSZ-NEXT: s_waitcnt vmcnt(0) define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -308,7 +316,9 @@ ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]] ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] +; GCN-NEXT: s_waitcnt vmcnt(0) define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -806,7 +816,9 @@ ; GCN: v_max_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]] ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]] ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] +; GCN-NEXT: s_waitcnt vmcnt(0) define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1042,7 +1054,9 @@ ; GCN: v_min_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]] ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]] ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] +; GCN-NEXT: s_waitcnt vmcnt(0) define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1111,7 +1125,9 @@ ; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] ; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]] ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] +; GCN-NEXT: s_waitcnt vmcnt(0) define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1142,7 +1158,9 @@ ; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]] ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] +; GCN-NEXT: s_waitcnt vmcnt(0) define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1349,7 +1367,9 @@ ; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]] ; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]] +; GCN-NSZ-NEXT: s_waitcnt vmcnt(0) ; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] +; GCN-NSZ-NEXT: s_waitcnt vmcnt(0) define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1434,7 +1454,9 @@ ; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MAD]] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] +; GCN-NEXT: s_waitcnt vmcnt(0) define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1859,7 +1881,9 @@ ; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]] ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN-NEXT: s_waitcnt vmcnt(0) define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1881,7 +1905,9 @@ ; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] ; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[ADD]], 4.0 ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] +; GCN-NEXT: s_waitcnt vmcnt(0) define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2316,7 +2342,9 @@ ; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0 ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA1]] +; GCN-NEXT: s_waitcnt vmcnt(0) define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2348,7 +2376,9 @@ ; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]] ; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]] ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] +; GCN-NEXT: s_waitcnt vmcnt(0) define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2379,7 +2409,9 @@ ; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] +; GCN-NEXT: s_waitcnt vmcnt(0) define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2418,7 +2450,9 @@ ; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL2]] +; GCN-NEXT: s_waitcnt vmcnt(0) define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2453,7 +2487,9 @@ ; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]] ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] +; GCN-NEXT: s_waitcnt vmcnt(0) define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -107,8 +107,8 @@ ; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr: ; GCN: s_waitcnt -; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen{{$}} -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off{{$}} +; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc{{$}} +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off glc{{$}} define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 { %val = load volatile i32, i32 addrspace(5)* %ptr ret void @@ -162,11 +162,11 @@ ; GCN: s_and_saveexec_b64 ; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]] -; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}} +; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}} ; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]] -; GFX9-MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}} -; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4{{$}} +; GFX9-MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}} +; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4 glc{{$}} ; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]] define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 } addrspace(5)* byval({ i8, i32 }) %arg0, i32 %arg2) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -537,10 +537,10 @@ } ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2: -; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32{{$}} -; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32 glc{{$}} +; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:4 glc{{$}} +; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:8 glc{{$}} +; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:12 glc{{$}} ; GCN: ds_write_b32 v0, v0 ; GCN: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -159,7 +159,8 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 @@ -168,7 +169,6 @@ ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -190,7 +190,8 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -199,7 +200,6 @@ ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -225,7 +225,8 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 @@ -234,7 +235,6 @@ ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -256,7 +256,8 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -265,7 +266,6 @@ ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -350,7 +350,8 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: global_load_sbyte v0, v[0:1], off +; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 @@ -378,7 +379,8 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_sbyte v0, v[0:1], off +; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -410,7 +412,8 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 @@ -438,7 +441,8 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -529,7 +533,8 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 @@ -557,7 +562,8 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -589,7 +595,8 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 @@ -617,7 +624,8 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -2877,6 +2885,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v41, v1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: global_store_dword v[40:41], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s4, v42, 0 @@ -2912,6 +2921,7 @@ ; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: global_store_dword v[40:41], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 @@ -2924,7 +2934,6 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[4:5] %val = call amdgpu_gfx i32 @external_i32_func_i32(i32 42) store volatile i32 %val, i32 addrspace(1)* %out @@ -3105,7 +3114,9 @@ ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] @@ -3145,13 +3156,14 @@ ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[4:5] %in.val = alloca { i8, i32 }, align 4, addrspace(5) %out.val = alloca { i8, i32 }, align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll --- a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll +++ b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll @@ -19,7 +19,9 @@ ; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_short v[0:1], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: load_1d_f16_tfe_dmask0: @@ -37,7 +39,9 @@ ; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_short v[0:1], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask0: @@ -55,7 +59,9 @@ ; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: s_endpgm %v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.data = extractvalue { half, i32 } %v, 0 @@ -81,7 +87,9 @@ ; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_short v[0:1], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: load_1d_f16_tfe_dmask1: @@ -99,7 +107,9 @@ ; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_short v[0:1], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask1: @@ -117,7 +127,9 @@ ; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: s_endpgm %v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.data = extractvalue { half, i32 } %v, 0 @@ -143,7 +155,9 @@ ; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: load_1d_v2f16_tfe_dmask0: @@ -161,7 +175,9 @@ ; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask0: @@ -179,7 +195,9 @@ ; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: s_endpgm %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.data = extractvalue { <2 x half>, i32 } %v, 0 @@ -205,7 +223,9 @@ ; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: load_1d_v2f16_tfe_dmask1: @@ -223,7 +243,9 @@ ; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask1: @@ -241,7 +263,9 @@ ; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: s_endpgm %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.data = extractvalue { <2 x half>, i32 } %v, 0 @@ -267,7 +291,9 @@ ; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 unorm tfe d16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: load_1d_v2f16_tfe_dmask3: @@ -285,7 +311,9 @@ ; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe d16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask3: @@ -306,7 +334,9 @@ ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v3 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: s_endpgm %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.data = extractvalue { <2 x half>, i32 } %v, 0 @@ -333,8 +363,11 @@ ; GFX9-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x7 unorm tfe d16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: load_1d_v3f16_tfe_dmask7: @@ -353,8 +386,11 @@ ; GFX10-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe d16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v[0:1], v3, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX8-UNPACKED-LABEL: load_1d_v3f16_tfe_dmask7: @@ -375,9 +411,12 @@ ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v3 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v4 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: s_endpgm %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.data = extractvalue { <3 x half>, i32 } %v, 0 @@ -404,7 +443,9 @@ ; GFX9-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf unorm tfe d16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[1:2], off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: load_1d_v4f16_tfe_dmask15: @@ -423,7 +464,9 @@ ; GFX10-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe d16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[1:2], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v[0:1], v3, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask15: @@ -448,7 +491,9 @@ ; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[1:2] +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v5 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: s_endpgm %v = call { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.data = extractvalue { <4 x half>, i32 } %v, 0 diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll --- a/llvm/test/CodeGen/AMDGPU/imm16.ll +++ b/llvm/test/CodeGen/AMDGPU/imm16.ll @@ -14,6 +14,7 @@ ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; VI-LABEL: store_inline_imm_neg_0.0_i16: @@ -24,6 +25,7 @@ ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] +; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: store_inline_imm_neg_0.0_i16: @@ -34,6 +36,7 @@ ; SI-NEXT: v_mov_b32_e32 v0, 0x8000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm store volatile i16 -32768, i16 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -14,6 +14,7 @@ ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_branch BB0_1 ; IR-LABEL: @infinite_loop( ; IR-NEXT: entry: @@ -45,6 +46,7 @@ ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc ; SI-NEXT: s_cbranch_vccnz BB1_2 ; SI-NEXT: BB1_3: ; %UnifiedReturnBlock @@ -87,6 +89,7 @@ ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc ; SI-NEXT: s_cbranch_vccnz BB2_2 ; SI-NEXT: ; %bb.3: ; %Flow @@ -105,6 +108,7 @@ ; SI-NEXT: BB2_6: ; %loop1 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc ; SI-NEXT: s_cbranch_vccz BB2_6 ; SI-NEXT: BB2_7: ; %DummyReturnBlock @@ -156,6 +160,7 @@ ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] ; SI-NEXT: s_cbranch_execnz BB3_3 ; SI-NEXT: ; %bb.4: ; %loop.exit.guard diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll --- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll @@ -245,6 +245,7 @@ ; FIXME: Should prodbably be masking high bits of load. ; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2: ; CHECK: buffer_load_ubyte v0 +; CHECK-NEXT: s_waitcnt ; CHECK-NEXT: buffer_load_ubyte v1 ; CHECK-NEXT: s_waitcnt ; CHECK-NEXT: ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1577,7 +1577,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v2, v[0:1], off +; GFX9-NEXT: global_load_dword v2, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -1585,7 +1586,6 @@ ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] ; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4 @@ -1604,7 +1604,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v4, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_mov_b32 s2, 0xffff ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -1614,7 +1615,6 @@ ; VI-NEXT: s_lshl_b32 s0, s1, 16 ; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 ; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1632,7 +1632,8 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v4, v[0:1] +; CI-NEXT: flat_load_dword v4, v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: s_mov_b32 s3, 0 ; CI-NEXT: s_mov_b32 s2, 0xffff @@ -1642,7 +1643,6 @@ ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: s_or_b32 s0, s4, s1 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 ; CI-NEXT: v_lshl_b64 v[4:5], s[2:3], v4 ; CI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -101,10 +101,11 @@ ; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[10:11], s[2:3] ; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, v2, v3 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 ; SI-NEXT: s_endpgm @@ -121,12 +122,13 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v1, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm @@ -137,8 +139,9 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -165,9 +168,9 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, 1.0 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -180,11 +183,11 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm @@ -194,7 +197,7 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, 1.0 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -219,9 +222,9 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, 1.0, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -234,11 +237,11 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm @@ -248,7 +251,7 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, 1.0, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -275,10 +278,11 @@ ; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[10:11], s[2:3] ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, v3 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 ; SI-NEXT: s_endpgm @@ -295,12 +299,13 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v1, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm @@ -311,8 +316,9 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -342,10 +348,11 @@ ; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[10:11], s[2:3] ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, -v3 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 ; SI-NEXT: s_endpgm @@ -362,12 +369,13 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v1, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm @@ -378,8 +386,9 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -409,10 +418,11 @@ ; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[10:11], s[2:3] ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, -v3 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 ; SI-NEXT: s_endpgm @@ -429,12 +439,13 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v1, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm @@ -445,8 +456,9 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -477,10 +489,11 @@ ; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[10:11], s[2:3] ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -|v2|, -v3 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 ; SI-NEXT: s_endpgm @@ -497,12 +510,13 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v1, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm @@ -513,8 +527,9 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll @@ -114,9 +114,9 @@ } ; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} ; SI-DAG: v_cmp_eq_u32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}} ; SI-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -22,13 +22,13 @@ ; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -50,13 +50,13 @@ ; VI-NEXT: s_mov_b32 s15, s3 ; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; VI-NEXT: v_max_f16_e32 v0, v0, v1 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -75,13 +75,13 @@ ; GFX9-NEXT: s_mov_b32 s15, s3 ; GFX9-NEXT: s_mov_b32 s10, s2 ; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -22,13 +22,13 @@ ; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -50,13 +50,13 @@ ; VI-NEXT: s_mov_b32 s15, s3 ; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; VI-NEXT: v_min_f16_e32 v0, v0, v1 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -75,13 +75,13 @@ ; GFX9-NEXT: s_mov_b32 s15, s3 ; GFX9-NEXT: s_mov_b32 s10, s2 ; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -536,13 +536,13 @@ ; GCN: s_waitcnt ; GFX900-MUBUFF: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}} ; GFX900-FLATSCR: s_movk_i32 [[SOFF:[^,]+]], 0xffe -; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, [[SOFF]]{{$}} +; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, [[SOFF]] glc{{$}} ; GFX900: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094 glc{{$}} define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval(i16) %in, i16 %reg) #0 { entry: %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*) @@ -554,15 +554,15 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff: ; GCN: s_waitcnt -; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}} ; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe -; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, [[SOFF]]{{$}} +; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, [[SOFF]] glc{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094 glc{{$}} define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 { entry: %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*) @@ -660,15 +660,15 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8: ; GCN: s_waitcnt -; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}} ; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe -; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]]{{$}} +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]] glc{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}} +; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc{{$}} define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 { entry: %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) @@ -681,15 +681,15 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8: ; GCN: s_waitcnt -; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}} ; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe -; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v1, off, [[SOFF]]{{$}} +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v1, off, [[SOFF]] glc{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094{{$}} +; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc{{$}} define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 { entry: %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) @@ -702,15 +702,15 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8: ; GCN: s_waitcnt -; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}} ; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe -; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]]{{$}} +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]] glc{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}} +; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc{{$}} define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 { entry: %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) @@ -805,9 +805,13 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset: ; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v{{[0-9]+}}, off, s32 offset:4094 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -824,9 +828,13 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset: ; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -844,9 +852,13 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset: ; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -997,8 +1009,8 @@ ; FIXME: Is there a cost to using the extload over not? ; GCN-LABEL: {{^}}load_private_v2i16_split: ; GCN: s_waitcnt -; GFX900-MUBUF: buffer_load_ushort v0, off, s[0:3], s32{{$}} -; GFX900-FLATSCR: scratch_load_ushort v0, off, s32{{$}} +; GFX900-MUBUF: buffer_load_ushort v0, off, s[0:3], s32 glc{{$}} +; GFX900-FLATSCR: scratch_load_ushort v0, off, s32 glc{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2 ; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s32 offset:2 diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -1333,7 +1333,7 @@ ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: ; GFX900-MUBUF: ; %bb.0: ; %entry ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 glc ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -1342,9 +1342,9 @@ ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -1353,9 +1353,9 @@ ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) @@ -1365,7 +1365,7 @@ ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe -; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0 +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0 glc ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -1382,7 +1382,7 @@ ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: ; GFX900-MUBUF: ; %bb.0: ; %entry ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 glc ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -1391,9 +1391,9 @@ ; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -1402,9 +1402,9 @@ ; GFX803-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) @@ -1414,7 +1414,7 @@ ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe -; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0 +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0 glc ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -1431,7 +1431,7 @@ ; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: ; GFX900-MUBUF: ; %bb.0: ; %entry ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 glc ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -1440,9 +1440,9 @@ ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off @@ -1452,9 +1452,9 @@ ; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) @@ -1464,7 +1464,7 @@ ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe -; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0 +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0 glc ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -1582,7 +1582,7 @@ ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: ; GFX900-MUBUF: ; %bb.0: ; %entry ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 glc ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -1591,9 +1591,9 @@ ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -1603,9 +1603,9 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094 -; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 +; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) @@ -1615,7 +1615,7 @@ ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe -; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0 +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0 glc ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -1633,7 +1633,7 @@ ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: ; GFX900-MUBUF: ; %bb.0: ; %entry ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094 glc ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -1642,9 +1642,9 @@ ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -1653,9 +1653,9 @@ ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX803-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX803-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) @@ -1665,7 +1665,7 @@ ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe -; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s0 +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s0 glc ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -1683,7 +1683,7 @@ ; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: ; GFX900-MUBUF: ; %bb.0: ; %entry ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 glc ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -1692,9 +1692,9 @@ ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off @@ -1705,9 +1705,9 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094 -; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 +; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) @@ -1717,7 +1717,7 @@ ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe -; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0 +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0 glc ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -1914,7 +1914,8 @@ ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-MUBUF-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 glc ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -1925,9 +1926,10 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 glc +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -1938,9 +1940,10 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 glc +; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) @@ -1951,7 +1954,8 @@ ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 -; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v0, off, s32 offset:4094 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v0, off, s32 offset:4094 glc ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -1975,7 +1979,8 @@ ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 glc ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -1986,9 +1991,10 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -1999,9 +2005,10 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 -; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc +; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) @@ -2012,7 +2019,8 @@ ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 -; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 glc ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -2037,7 +2045,8 @@ ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 glc ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -2048,9 +2057,10 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -2061,10 +2071,11 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 +; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 -; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) @@ -2075,7 +2086,8 @@ ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 -; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 glc ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -2100,7 +2112,8 @@ ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 glc ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -2111,9 +2124,10 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off @@ -2125,9 +2139,10 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 -; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc +; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) @@ -2138,7 +2153,8 @@ ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 -; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 glc ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -2164,7 +2180,8 @@ ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 glc ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -2175,9 +2192,10 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off @@ -2189,10 +2207,11 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 +; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 -; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) @@ -2203,7 +2222,8 @@ ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 -; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 glc ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -30,27 +30,32 @@ ; MUBUF-NEXT: v_mov_b32_e32 v3, 0x2000 ; MUBUF-NEXT: s_mov_b32 s6, 0 ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: BB0_1: ; %loadstoreloop ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 ; MUBUF-NEXT: v_add_u32_e32 v3, s6, v1 ; MUBUF-NEXT: s_add_i32 s6, s6, 1 ; MUBUF-NEXT: s_cmpk_lt_u32 s6, 0x2120 ; MUBUF-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen +; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_cbranch_scc1 BB0_1 ; MUBUF-NEXT: ; %bb.2: ; %split ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000 ; MUBUF-NEXT: v_add_u32_e32 v1, 0x20d0, v1 -; MUBUF-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen -; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 -; MUBUF-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen -; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 -; MUBUF-NEXT: s_waitcnt vmcnt(1) -; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3 +; MUBUF-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen glc +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 glc +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen glc +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3 ; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; MUBUF-NEXT: v_mov_b32_e32 v2, 0 ; MUBUF-NEXT: s_waitcnt lgkmcnt(0) ; MUBUF-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_endpgm ; ; FLATSCR-LABEL: local_stack_offset_uses_sp: @@ -62,25 +67,29 @@ ; FLATSCR-NEXT: s_movk_i32 vcc_hi, 0x2000 ; FLATSCR-NEXT: s_mov_b32 s2, 0 ; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: BB0_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLATSCR-NEXT: s_add_u32 s3, 0x3000, s2 ; FLATSCR-NEXT: s_add_i32 s2, s2, 1 ; FLATSCR-NEXT: s_cmpk_lt_u32 s2, 0x2120 ; FLATSCR-NEXT: scratch_store_byte off, v0, s3 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 BB0_1 ; FLATSCR-NEXT: ; %bb.2: ; %split ; FLATSCR-NEXT: s_movk_i32 s2, 0x2000 ; FLATSCR-NEXT: s_add_u32 s2, 0x3000, s2 -; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s2 offset:208 +; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s2 offset:208 glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_movk_i32 s2, 0x3000 -; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s2 offset:64 +; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s2 offset:64 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_endpgm entry: %pin.low = alloca i32, align 8192, addrspace(5) @@ -111,26 +120,30 @@ ; MUBUF-NEXT: s_mov_b32 s4, 0 ; MUBUF-NEXT: s_add_u32 s32, s32, 0x180000 ; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: BB1_1: ; %loadstoreloop ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 ; MUBUF-NEXT: v_add_u32_e32 v5, s4, v3 ; MUBUF-NEXT: s_add_i32 s4, s4, 1 ; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120 ; MUBUF-NEXT: buffer_store_byte v4, v5, s[0:3], 0 offen +; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_cbranch_scc1 BB1_1 ; MUBUF-NEXT: ; %bb.2: ; %split ; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33 ; MUBUF-NEXT: v_add_u32_e32 v3, 0x1000, v3 ; MUBUF-NEXT: v_add_u32_e32 v3, 0x20d0, v3 -; MUBUF-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen -; MUBUF-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen offset:4 -; MUBUF-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen -; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen glc +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen offset:4 glc +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen glc +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:4 glc +; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x180000 ; MUBUF-NEXT: s_mov_b32 s33, s5 -; MUBUF-NEXT: s_waitcnt vmcnt(1) ; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v4, v5 -; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc ; MUBUF-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -146,6 +159,7 @@ ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: s_add_u32 s32, s32, 0x6000 ; FLATSCR-NEXT: scratch_store_dword off, v2, s33 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: BB1_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLATSCR-NEXT: s_add_u32 vcc_hi, s33, 0x1000 @@ -153,17 +167,19 @@ ; FLATSCR-NEXT: s_add_i32 s0, s0, 1 ; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120 ; FLATSCR-NEXT: scratch_store_byte off, v2, s1 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 BB1_1 ; FLATSCR-NEXT: ; %bb.2: ; %split ; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 ; FLATSCR-NEXT: s_add_u32 s1, s33, 0x1000 ; FLATSCR-NEXT: s_add_u32 s0, s1, s0 -; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208 +; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208 glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_add_u32 s0, s33, 0x1000 -; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:64 +; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:64 glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x6000 ; FLATSCR-NEXT: s_mov_b32 s33, s2 -; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc ; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off diff --git a/llvm/test/CodeGen/AMDGPU/mad-combine.ll b/llvm/test/CodeGen/AMDGPU/mad-combine.ll --- a/llvm/test/CodeGen/AMDGPU/mad-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-combine.ll @@ -17,9 +17,9 @@ ; (fadd (fmul x, y), z) -> (fma x, y, z) ; FUNC-LABEL: {{^}}combine_to_mad_f32_0: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} ; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]] @@ -52,10 +52,10 @@ ; (fadd (fmul x, y), z) -> (fma x, y, z) ; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}} ; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]] ; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]] @@ -97,9 +97,9 @@ ; (fadd x, (fmul y, z)) -> (fma y, z, x) ; FUNC-LABEL: {{^}}combine_to_mad_f32_1: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} ; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]] ; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] @@ -128,9 +128,9 @@ ; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) ; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} ; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]] ; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]] @@ -158,10 +158,10 @@ ; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) ; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}} ; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]] ; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] @@ -200,9 +200,9 @@ ; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) ; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} ; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]] ; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]] @@ -230,10 +230,10 @@ ; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) ; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}} ; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]] ; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]] @@ -272,9 +272,9 @@ ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) ; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} ; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]] @@ -305,10 +305,10 @@ ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) ; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}} ; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], -[[B]], -[[C]] ; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], -[[B]], -[[D]] @@ -349,10 +349,10 @@ ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) ; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}} ; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] ; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] @@ -394,11 +394,11 @@ ; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z))) ; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}} +; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} ; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]] ; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]] @@ -436,11 +436,11 @@ ; -> (fma (fneg y), z, (fma (fneg u), v, x)) ; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}} +; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} ; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]] ; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]] @@ -478,11 +478,11 @@ ; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z))) ; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}} +; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} ; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]] ; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[A]], [[B]] @@ -529,11 +529,11 @@ ; -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x)) ; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}} +; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} ; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]] ; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[C]] diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -120,8 +120,8 @@ ; GCN: s_waitcnt ; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]{{$}} ; GFX9-NEXT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, v3 -; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}} ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}} ; GFX9-NEXT: s_setpc_b64 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 { %src0.ext = fpext half %src0 to float diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -41,7 +41,7 @@ ; GCN-LABEL: {{^}}madak_2_use_f32: ; GFX9: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GFX10: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 ; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]], diff --git a/llvm/test/CodeGen/AMDGPU/madmk.ll b/llvm/test/CodeGen/AMDGPU/madmk.ll --- a/llvm/test/CodeGen/AMDGPU/madmk.ll +++ b/llvm/test/CodeGen/AMDGPU/madmk.ll @@ -9,7 +9,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; GCN-LABEL: {{^}}madmk_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN: v_mac_f32_e32 [[VB]], 0x41200000, [[VA]] define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { @@ -28,7 +28,7 @@ } ; GCN-LABEL: {{^}}madmk_2_use_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 ; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x41200000 @@ -61,7 +61,7 @@ ; We don't get any benefit if the constant is an inline immediate. ; GCN-LABEL: {{^}}madmk_inline_imm_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN: v_mac_f32_e32 [[VB]], 4.0, [[VA]] define amdgpu_kernel void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { @@ -128,7 +128,7 @@ } ; GCN-LABEL: {{^}}no_madmk_src0_modifier_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x41200000 ; GCN: v_mad_f32 {{v[0-9]+}}, |[[VA]]|, [[SK]], [[VB]] @@ -150,7 +150,7 @@ } ; GCN-LABEL: {{^}}no_madmk_src2_modifier_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{[sv][0-9]+}}, |{{v[0-9]+}}| define amdgpu_kernel void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -0,0 +1,396 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s + +define amdgpu_kernel void @flat_nontemporal_load_0( +; GFX7-LABEL: flat_nontemporal_load_0: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_nontemporal_load_0: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_nontemporal_load_0: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_0: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load volatile i32, i32* %in, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_nontemporal_load_1( +; GFX7-LABEL: flat_nontemporal_load_1: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX7-NEXT: flat_load_dword v2, v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_nontemporal_load_1: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_add_co_u32_e64 v0, s0, s0, v0 +; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_nontemporal_load_1: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_add_co_u32_e64 v0, s0, s0, v0 +; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_1: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; SKIP-CACHE-INV-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, i32* %in, i32 %tid + %val = load volatile i32, i32* %val.gep, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_nontemporal_store_0( +; GFX7-LABEL: flat_nontemporal_store_0: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_nontemporal_store_0: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_nontemporal_store_0: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_0: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load i32, i32* %in, align 4 + store volatile i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_nontemporal_store_1( +; GFX7-LABEL: flat_nontemporal_store_1: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: flat_load_dword v2, v[1:2] +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_nontemporal_store_1: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2] +; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_nontemporal_store_1: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-CU-NEXT: flat_load_dword v2, v[1:2] +; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_1: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[1:2] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; SKIP-CACHE-INV-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, i32* %in, align 4 + %out.gep = getelementptr inbounds i32, i32* %out, i32 %tid + store volatile i32 %val, i32* %out.gep + ret void +} + +define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( +; GFX7-LABEL: flat_volatile_workgroup_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_volatile_workgroup_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic volatile i32, i32* %in syncscope("workgroup") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_volatile_workgroup_release_store( +; GFX7-LABEL: flat_volatile_workgroup_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_volatile_workgroup_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_volatile_workgroup_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic volatile i32 %in, i32* %out syncscope("workgroup") release, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -0,0 +1,458 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s + +define amdgpu_kernel void @global_volatile_load_0( +; GFX6-LABEL: global_volatile_load_0: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_volatile_load_0: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_volatile_load_0: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_volatile_load_0: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_volatile_load_0: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load volatile i32, i32 addrspace(1)* %in, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_volatile_load_1( +; GFX6-LABEL: global_volatile_load_1: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s6 +; GFX6-NEXT: s_mov_b32 s1, s7 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_volatile_load_1: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX7-NEXT: flat_load_dword v2, v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_volatile_load_1: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v0, v0, s[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_volatile_load_1: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v0, v0, s[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_volatile_load_1: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid + %val = load volatile i32, i32 addrspace(1)* %val.gep, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_volatile_store_0( +; GFX6-LABEL: global_volatile_store_0: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_volatile_store_0: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_volatile_store_0: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_volatile_store_0: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_volatile_store_0: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load i32, i32 addrspace(1)* %in, align 4 + store volatile i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_volatile_store_1( +; GFX6-LABEL: global_volatile_store_1: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_volatile_store_1: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_volatile_store_1: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_volatile_store_1: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_volatile_store_1: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0 +; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, i32 addrspace(1)* %in, align 4 + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid + store volatile i32 %val, i32 addrspace(1)* %out.gep + ret void +} + +define amdgpu_kernel void @global_volatile_workgroup_acquire_load( +; GFX6-LABEL: global_volatile_workgroup_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_volatile_workgroup_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_volatile_workgroup_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_volatile_workgroup_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_volatile_workgroup_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic volatile i32, i32 addrspace(1)* %in syncscope("workgroup") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_volatile_workgroup_release_store( +; GFX6-LABEL: global_volatile_workgroup_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_volatile_workgroup_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_volatile_workgroup_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_volatile_workgroup_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_volatile_workgroup_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic volatile i32 %in, i32 addrspace(1)* %out syncscope("workgroup") release, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -0,0 +1,440 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s + +define amdgpu_kernel void @local_volatile_load_0( +; GFX6-LABEL: local_volatile_load_0: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_volatile_load_0: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_volatile_load_0: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_volatile_load_0: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_volatile_load_0: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(1)* %out) { +entry: + %val = load volatile i32, i32 addrspace(3)* %in, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @local_volatile_load_1( +; GFX6-LABEL: local_volatile_load_1: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_volatile_load_1: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_volatile_load_1: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_volatile_load_1: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_volatile_load_1: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(1)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid + %val = load volatile i32, i32 addrspace(3)* %val.gep, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @local_volatile_store_0( +; GFX6-LABEL: local_volatile_store_0: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_volatile_store_0: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_volatile_store_0: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_volatile_store_0: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_volatile_store_0: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(3)* %out) { +entry: + %val = load i32, i32 addrspace(1)* %in, align 4 + store volatile i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_volatile_store_1( +; GFX6-LABEL: local_volatile_store_1: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_volatile_store_1: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_volatile_store_1: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_volatile_store_1: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_volatile_store_1: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(3)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, i32 addrspace(1)* %in, align 4 + %out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid + store volatile i32 %val, i32 addrspace(3)* %out.gep + ret void +} + +define amdgpu_kernel void @local_volatile_workgroup_acquire_load( +; GFX6-LABEL: local_volatile_workgroup_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_volatile_workgroup_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_volatile_workgroup_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_volatile_workgroup_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_volatile_workgroup_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic volatile i32, i32 addrspace(3)* %in syncscope("workgroup") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_volatile_workgroup_release_store( +; GFX6-LABEL: local_volatile_workgroup_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_volatile_workgroup_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_volatile_workgroup_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_volatile_workgroup_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_volatile_workgroup_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic volatile i32 %in, i32 addrspace(3)* %out syncscope("workgroup") release, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -0,0 +1,404 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s + +define amdgpu_kernel void @private_volatile_load_0( +; GFX6-LABEL: private_volatile_load_0: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX6-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX6-NEXT: s_add_u32 s8, s8, s3 +; GFX6-NEXT: s_addc_u32 s9, s9, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_volatile_load_0: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_add_u32 s8, s8, s7 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_volatile_load_0: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7 +; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_volatile_load_0: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_add_u32 s8, s8, s7 +; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_volatile_load_0: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[8:9] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s8, s8, s3 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s9, s9, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm + i32 addrspace(5)* %in, i32 addrspace(1)* %out) { +entry: + %val = load volatile i32, i32 addrspace(5)* %in, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @private_volatile_load_1( +; GFX6-LABEL: private_volatile_load_1: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX6-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX6-NEXT: s_add_u32 s8, s8, s3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX6-NEXT: s_addc_u32 s9, s9, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_volatile_load_1: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_add_u32 s8, s8, s7 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_volatile_load_1: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7 +; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_volatile_load_1: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_add_u32 s8, s8, s7 +; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_volatile_load_1: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[8:9] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s8, s8, s3 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s9, s9, 0 +; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm + i32 addrspace(5)* %in, i32 addrspace(1)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, i32 addrspace(5)* %in, i32 %tid + %val = load volatile i32, i32 addrspace(5)* %val.gep, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @private_volatile_store_0( +; GFX6-LABEL: private_volatile_store_0: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GFX6-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX6-NEXT: s_add_u32 s4, s4, s3 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_addc_u32 s5, s5, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_volatile_store_0: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_add_u32 s8, s8, s7 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_volatile_store_0: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7 +; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_volatile_store_0: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_add_u32 s8, s8, s7 +; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_volatile_store_0: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(5)* %out) { +entry: + %val = load i32, i32 addrspace(1)* %in, align 4 + store volatile i32 %val, i32 addrspace(5)* %out + ret void +} + +define amdgpu_kernel void @private_volatile_store_1( +; GFX6-LABEL: private_volatile_store_1: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GFX6-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX6-NEXT: s_add_u32 s4, s4, s3 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX6-NEXT: s_addc_u32 s5, s5, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_volatile_store_1: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_add_u32 s8, s8, s7 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_volatile_store_1: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7 +; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_volatile_store_1: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_add_u32 s8, s8, s7 +; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_volatile_store_1: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(5)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, i32 addrspace(1)* %in, align 4 + %out.gep = getelementptr inbounds i32, i32 addrspace(5)* %out, i32 %tid + store volatile i32 %val, i32 addrspace(5)* %out.gep + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -154,8 +154,10 @@ ; GFX9-NEXT: v_mul_u32_u24_e32 v3, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v1 ; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v0, v0, v1, v2 ; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -234,8 +234,8 @@ ; GCN-NEXT: ; %bb.8: ; %case0 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; GCN-NEXT: s_mov_b64 s[8:9], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], 0 ; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 ; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec ; GCN-NEXT: s_and_b64 s[10:11], vcc, exec diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll --- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -186,6 +186,7 @@ ; GCN-NEXT: BB1_6: ; %bb31 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm ; IR-LABEL: @nested_loop_conditions( ; IR-NEXT: bb: diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -48,6 +48,7 @@ ; MUBUF-NEXT: BB0_3: ; %bb.2 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_endpgm ; ; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: @@ -83,6 +84,7 @@ ; FLATSCR-NEXT: BB0_3: ; %bb.2 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_endpgm entry: @@ -151,6 +153,7 @@ ; MUBUF-NEXT: BB1_2: ; %bb.1 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_endpgm ; ; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: @@ -181,6 +184,7 @@ ; FLATSCR-NEXT: BB1_2: ; %bb.1 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_endpgm entry: %cond = icmp eq i32 %arg.cond, 0 @@ -243,9 +247,9 @@ ; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5] ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x400 ; MUBUF-NEXT: s_mov_b32 s33, s7 -; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4: @@ -280,9 +284,9 @@ ; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_sub_u32 s32, s32, 16 ; FLATSCR-NEXT: s_mov_b32 s33, s5 -; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: @@ -342,9 +346,9 @@ ; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5] ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x2000 ; MUBUF-NEXT: s_mov_b32 s33, s7 -; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64: @@ -374,9 +378,9 @@ ; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x80 ; FLATSCR-NEXT: s_mov_b32 s33, s3 -; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %cond = icmp eq i32 %arg.cond, 0 diff --git a/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll b/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll --- a/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll +++ b/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll @@ -34,7 +34,7 @@ ; GCN-LABEL: {{^}}load_from_undef: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offen{{$}} +; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offen glc{{$}} define amdgpu_kernel void @load_from_undef() #0 { %ld = load volatile i32, i32 addrspace(5)* undef ret void @@ -43,7 +43,7 @@ ; GCN-LABEL: {{^}}load_from_inttoptr: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offset:124{{$}} +; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offset:124 glc{{$}} define amdgpu_kernel void @load_from_inttoptr() #0 { %ld = load volatile i32, i32 addrspace(5)* inttoptr (i32 124 to i32 addrspace(5)*) ret void diff --git a/llvm/test/CodeGen/AMDGPU/rsq.ll b/llvm/test/CodeGen/AMDGPU/rsq.ll --- a/llvm/test/CodeGen/AMDGPU/rsq.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.ll @@ -44,7 +44,7 @@ ; NOTE: c * rcp( sqrt(a) * b ) is generated when we move rcp generation to AMGGPUCogenPrepare. ; SI-LABEL: @rsqrt_fmul -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -24,19 +24,19 @@ ; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 -; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 -; SI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc @@ -65,15 +65,17 @@ ; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 -; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 -; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -112,16 +114,16 @@ ; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc @@ -147,14 +149,15 @@ ; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm @@ -191,16 +194,16 @@ ; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc @@ -226,14 +229,15 @@ ; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm @@ -270,16 +274,16 @@ ; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc @@ -305,15 +309,16 @@ ; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, 0x3800 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm @@ -350,16 +355,16 @@ ; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc @@ -385,15 +390,16 @@ ; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, 0x3800 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -294,11 +294,12 @@ ; GCN-NEXT: s_mov_b32 s14, 0 ; GCN-NEXT: s_mov_b32 s15, s3 ; GCN-NEXT: s_mov_b64 s[12:13], s[6:7] -; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 -; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2 +; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2 glc +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: v_lshl_b32_e32 v0, v2, v0 @@ -1640,6 +1641,7 @@ ; GCN-NEXT: s_lshl_b32 s0, s0, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm ; ; EG-LABEL: test_mul2: diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -82,15 +82,17 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_test_i32_x_sub_64_multi_use: @@ -101,17 +103,19 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v3, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v4, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_subrev_u32_e32 v3, vcc, 64, v4 ; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_64_multi_use: @@ -119,14 +123,16 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1 +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1 ; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_x_sub_64_multi_use: @@ -134,15 +140,16 @@ ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -804,15 +811,17 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3 ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_short v3, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_test_i16_x_sub_64_multi_use: @@ -823,17 +832,19 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ushort v3, v[0:1] -; VI-NEXT: flat_load_ushort v4, v[0:1] +; VI-NEXT: flat_load_ushort v3, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_subrev_u16_e32 v2, 64, v3 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_subrev_u16_e32 v3, 64, v4 ; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_short v[0:1], v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i16_x_sub_64_multi_use: @@ -841,14 +852,16 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1 +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1 ; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_short v0, v2, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i16_x_sub_64_multi_use: @@ -856,15 +869,16 @@ ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_sub_nc_u16_e64 v1, v1, 64 +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_nc_u16_e64 v1, v1, 64 ; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, 64 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_short v0, v2, s[0:1] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -212,6 +212,7 @@ ; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 ; SI-NEXT: s_mov_b64 s[12:13], 0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_branch BB3_1 ; SI-NEXT: BB3_8: ; %loop.exit.guard4 ; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 @@ -284,6 +285,7 @@ ; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 ; FLAT-NEXT: s_mov_b64 s[12:13], 0 ; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: s_branch BB3_1 ; FLAT-NEXT: BB3_8: ; %loop.exit.guard4 ; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -351,15 +351,16 @@ ; SI-NEXT: s_sext_i32_i8 s0, s0 ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_v4i8_to_v4i32: @@ -376,11 +377,15 @@ ; VI-NEXT: v_bfe_i32 v0, v0, 0, 8 ; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm %cast = bitcast i32 %a to <4 x i8> %ext = sext <4 x i8> %cast to <4 x i32> @@ -417,9 +422,13 @@ ; SI-NEXT: v_bfe_i32 v3, v0, 8, 8 ; SI-NEXT: v_bfe_i32 v0, v0, 0, 8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_sext_v4i8_to_v4i32: @@ -442,9 +451,13 @@ ; VI-NEXT: v_bfe_i32 v0, v0, 0, 8 ; VI-NEXT: v_bfe_i32 v1, v1, 0, 8 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm %a = load i32, i32 addrspace(1)* %in %cast = bitcast i32 %a to <4 x i8> @@ -475,16 +488,17 @@ ; SI-NEXT: s_sext_i32_i16 s6, s6 ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: s_sext_i32_i16 s7, s7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_v4i16_to_v4i32: @@ -500,13 +514,17 @@ ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_ashr_i32 s4, s7, 16 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: s_sext_i32_i16 s7, s7 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm %cast = bitcast i64 %a to <4 x i16> %ext = sext <4 x i16> %cast to <4 x i32> @@ -541,9 +559,13 @@ ; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v1, v1, 0, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_sext_v4i16_to_v4i32: @@ -565,9 +587,13 @@ ; VI-NEXT: v_ashrrev_i32_e32 v2, 16, v1 ; VI-NEXT: v_bfe_i32 v1, v1, 0, 16 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm %a = load i64, i64 addrspace(1)* %in %cast = bitcast i64 %a to <4 x i16> diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll @@ -13,6 +13,7 @@ ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; VI-NEXT: .section .rodata,#alloc ; VI-NEXT: .p2align 6 @@ -59,6 +60,7 @@ ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .section .rodata,#alloc ; GFX9-NEXT: .p2align 6 @@ -112,6 +114,7 @@ ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; VI-NEXT: .section .rodata,#alloc ; VI-NEXT: .p2align 6 @@ -158,6 +161,7 @@ ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .section .rodata,#alloc ; GFX9-NEXT: .p2align 6 @@ -211,6 +215,7 @@ ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; VI-NEXT: .section .rodata,#alloc ; VI-NEXT: .p2align 6 @@ -257,6 +262,7 @@ ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .section .rodata,#alloc ; GFX9-NEXT: .p2align 6 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -165,6 +165,7 @@ ; GCN: s_mov_b32 s34, s32 ; GCN: v_mov_b32_e32 v32, 0 ; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 ; GCN-NEXT: s_add_u32 s32, s32, 0x30000 @@ -237,10 +238,10 @@ ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 ; GCN: v_readlane_b32 s34, [[VGPR_REG:v[0-9]+]], 0 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN: s_setpc_b64 s[30:31] %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, i32 addrspace(5)* %local_val, align 128 ; Use all clobberable registers, so BP has to spill to a VGPR. diff --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll @@ -646,9 +646,13 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset: ; GCN: s_waitcnt ; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, s32 offset:4094 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -665,9 +669,12 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset: ; GCN: s_waitcnt ; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4095 ; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -13,10 +13,11 @@ ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v0, v0, s[8:9] -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v0, v0, s[8:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -33,11 +34,12 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v1, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u16_e32 v2, v0, v1 ; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v0, v2, v0 @@ -163,11 +165,11 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -180,11 +182,11 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, 0xfffffe38 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, 0xff85, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v2, v0 @@ -208,11 +210,11 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -225,11 +227,11 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, 0x3df ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, 0x34d, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v2, v0 @@ -252,10 +254,10 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -268,11 +270,11 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, 1 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, 1, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v2, v0 @@ -295,10 +297,10 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -311,10 +313,10 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; VI-NEXT: v_subrev_u16_e32 v0, 32, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -338,11 +340,11 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -357,11 +359,11 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, 0xffffc080 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -383,11 +385,12 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -406,11 +409,12 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v1, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u16_e32 v0, v1, v2 ; VI-NEXT: v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -437,11 +441,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v2, v2, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -460,13 +465,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v4, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u16_e32 v0, v4, v2 ; VI-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 @@ -491,11 +497,12 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -514,11 +521,12 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v1, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_sub_u16_e32 v0, v0, v1 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll --- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -44,11 +44,12 @@ ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_mov_b32 s3, 0xf000 ; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0 -; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 ; CHECK-NEXT: BB1_1: ; %bb9 @@ -102,6 +103,7 @@ ; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: v_mov_b32_e32 v0, v6 ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: v_nop ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll --- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -42,12 +42,16 @@ ; GCN-LABEL: {{^}}test_use_s_v_s: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[SA:[0-9]+]]:[[SB:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; SI: buffer_load_dword [[VA0:v[0-9]+]] +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_dword [[VA1:v[0-9]+]] +; SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOT: v_mov_b32 ; VI: buffer_load_dword [[VA0:v[0-9]+]] +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_dword [[VA1:v[0-9]+]] +; VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOT: v_mov_b32 ; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[SB]] diff --git a/llvm/test/CodeGen/AMDGPU/v_mac.ll b/llvm/test/CodeGen/AMDGPU/v_mac.ll --- a/llvm/test/CodeGen/AMDGPU/v_mac.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac.ll @@ -3,7 +3,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s ; GCN-LABEL: {{^}}mac_vvv: -; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}} +; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 glc{{$}} ; GCN: buffer_load_dword [[B:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8 ; GCN: v_mac_f32_e32 [[C]], [[A]], [[B]] diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -82,19 +82,19 @@ ; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 -; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v3, 0x41200000 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_madak_f32 v1, v0, v1, 0x41200000 ; SI-NEXT: v_mac_f32_e32 v3, v0, v2 @@ -121,17 +121,18 @@ ; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 -; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, 0x4900 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_mov_b32 s8, s6 ; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_madak_f16 v1, v0, v1, 0x4900 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mac_f16_e32 v3, v0, v2 ; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 ; VI-NEXT: buffer_store_short v3, off, s[8:11], 0 diff --git a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll --- a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll @@ -114,9 +114,9 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v5, v2 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 addr64 -; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: buffer_store_dword v0, v[4:5], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm