diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -13,6 +13,9 @@
 /// S_WAITCNT instructions when we want to access any of their results or
 /// overwrite any register that's used asynchronously.
 ///
+/// This pass will remove cache invalidation instructions if it can prove that
+/// they are unnecessary.
+///
 /// TODO: This pass currently keeps one timeline per hardware counter. A more
 /// finely-grained approach that keeps one timeline per event type could
 /// sometimes get away with generating weaker s_waitcnt instructions. For
@@ -83,6 +86,23 @@
                     enum_iterator(NUM_INST_CNTS));
 }
 
+// Cache levels whose potential staleness is tracked. The enumerators are
+// contiguous so that they can be iterated and used as array indices.
+enum MemoryCacheLevel {
+  MEM_CACHE_LVL_BEGIN = 0,
+  MEM_CACHE_LVL_0 = MEM_CACHE_LVL_BEGIN,
+  MEM_CACHE_LVL_1,
+  MEM_CACHE_LVL_2,
+  MEM_CACHE_LVL_END
+};
+
+// Return a range over every tracked cache level; MEM_CACHE_LVL_END is the
+// exclusive end marker.
+iterator_range<enum_iterator<MemoryCacheLevel>> memoryCacheLevels() {
+  return make_range(enum_iterator<MemoryCacheLevel>(MEM_CACHE_LVL_BEGIN),
+                    enum_iterator<MemoryCacheLevel>(MEM_CACHE_LVL_END));
+}
+
 using RegInterval = std::pair;
 
 struct {
@@ -180,6 +200,26 @@
   }
 }
 
+// Return true if OpCode is one of the instructions treated as invalidating
+// the L1 cache.
+bool instructionInvalidatesL1Cache(unsigned OpCode) {
+  switch (OpCode) {
+  case AMDGPU::BUFFER_WBINVL1:
+  case AMDGPU::BUFFER_WBINVL1_SC:
+  case AMDGPU::BUFFER_WBINVL1_VOL:
+  case AMDGPU::BUFFER_GL1_INV:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Return true if OpCode is the instruction treated as invalidating the L0
+// cache.
+bool instructionInvalidatesL0Cache(unsigned OpCode) {
+  return OpCode == AMDGPU::BUFFER_GL0_INV;
+}
+
 // This objects maintains the current score brackets of each wait counter, and
 // a per-register scoreboard for each wait counter.
 //
@@ -254,8 +294,15 @@
   void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                      const MachineRegisterInfo *MRI, WaitEventType E,
                      MachineInstr &MI);
+  // Record which cache levels event E may leave potentially stale.
+  void updatePotentiallyStaleCacheByEvent(WaitEventType E);
+
+  // Return true if any wait event is pending or any cache level is
+  // potentially stale.
+  bool hasPending() const {
+    return PendingEvents != 0 || hasPotentiallyStaleCache();
+  }
 
-  bool hasPending() const { return PendingEvents != 0; }
   bool hasPendingEvent(WaitEventType E) const {
     return PendingEvents & (1 << E);
   }
@@ -278,6 +325,30 @@
     LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
   }
 
+  // Return true if any cache level is potentially stale.
+  bool hasPotentiallyStaleCache() const {
+    for (MemoryCacheLevel Level : memoryCacheLevels()) {
+      if (hasPotentiallyStaleCacheAtLevel(Level))
+        return true;
+    }
+    return false;
+  }
+
+  // Return true if the cache at Level is potentially stale.
+  bool hasPotentiallyStaleCacheAtLevel(MemoryCacheLevel Level) const {
+    return HasPotentiallyStaleCache[Level];
+  }
+
+  // Mark the cache at Level as potentially stale.
+  void setPotentiallyStaleCacheAtLevel(MemoryCacheLevel Level) {
+    HasPotentiallyStaleCache[Level] = true;
+  }
+
+  // Mark the cache at Level as not stale.
+  void clearPotentiallyStaleCacheAtLevel(MemoryCacheLevel Level) {
+    HasPotentiallyStaleCache[Level] = false;
+  }
+
   // Return true if there might be pending writes to the specified vgpr by VMEM
   // instructions with types different from V.
   bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
@@ -349,6 +420,8 @@
   // Bitmask of the VmemTypes of VMEM instructions that might have a pending
   // write to each vgpr.
   unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
+  // Tracks whether the cache at each level is potentially stale.
+  bool HasPotentiallyStaleCache[MEM_CACHE_LVL_END] = {};
 };
 
 class SIInsertWaitcnts : public MachineFunctionPass {
@@ -438,10 +496,21 @@
   bool generateWaitcntInstBefore(MachineInstr &MI,
                                  WaitcntBrackets &ScoreBrackets,
                                  MachineInstr *OldWaitcntInstr);
-  void updateEventWaitcntAfter(MachineInstr &Inst,
-                               WaitcntBrackets *ScoreBrackets);
+  void updateWaitcntBracketAfter(MachineInstr &Inst,
+                                 WaitcntBrackets *ScoreBrackets);
   bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                             WaitcntBrackets &ScoreBrackets);
+  // Clear the stale flag of every cache level that MI invalidates.
+  void clearStaleCacheLevelsIfIsCacheInvalidationInstruction(
+      MachineInstr &MI, WaitcntBrackets &Brackets);
+
+  // Detection and removal of cache invalidations that target a cache level
+  // not marked as potentially stale.
+  bool removeUnnecessaryMemoryCacheInvalidationInstructions();
+  bool removeIfUnnecessaryMemoryCacheInvalidationInBlock(
+      MachineBasicBlock &BB, const WaitcntBrackets &Bracket);
+  bool isUnnecessaryMemoryCacheInvalidation(const MachineInstr &Inst,
+                                            WaitcntBrackets &Brackets);
 };
 
 } // end anonymous namespace
@@ -512,6 +581,7 @@
   // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
   PendingEvents |= 1 << E;
   setScoreUB(T, CurrScore);
+  updatePotentiallyStaleCacheByEvent(E);
 
   if (T == EXP_CNT) {
     // Put score on the source vgprs. If this is a store, just use those
@@ -653,6 +723,33 @@
   }
 }
 
+// Mark the cache levels that event E may leave potentially stale: VMEM, SMEM
+// and GDS accesses mark L0 and L1; LDS, EXP_POS and EXP_PARAM accesses mark
+// L0 only; every other event marks nothing.
+void WaitcntBrackets::updatePotentiallyStaleCacheByEvent(WaitEventType E) {
+  switch (E) {
+  case SQ_MESSAGE:
+  case EXP_GPR_LOCK:
+  case GDS_GPR_LOCK:
+  case VMW_GPR_LOCK:
+  case NUM_WAIT_EVENTS:
+    return;
+  case LDS_ACCESS:
+  case EXP_POS_ACCESS:
+  case EXP_PARAM_ACCESS:
+    setPotentiallyStaleCacheAtLevel(MEM_CACHE_LVL_0);
+    return;
+  case VMEM_ACCESS:
+  case VMEM_READ_ACCESS:
+  case VMEM_WRITE_ACCESS:
+  case SMEM_ACCESS:
+  case GDS_ACCESS:
+    setPotentiallyStaleCacheAtLevel(MEM_CACHE_LVL_0);
+    setPotentiallyStaleCacheAtLevel(MEM_CACHE_LVL_1);
+    return;
+  }
+}
+
 void WaitcntBrackets::print(raw_ostream &OS) {
   OS << '\n';
   for (auto T : inst_counter_types()) {
@@ -703,6 +800,9 @@
     }
     OS << '\n';
   }
+  for (MemoryCacheLevel Level : memoryCacheLevels())
+    OS << "Has potentially stale L" << Level << " cache: "
+       << hasPotentiallyStaleCacheAtLevel(Level) << '\n';
   OS << '\n';
 }
 
@@ -842,11 +942,8 @@
 
   // See if this instruction has a forced S_WAITCNT VM.
   // TODO: Handle other cases of NeedsWaitcntVmBefore()
-  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
-      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
-      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
-      MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
-      MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
+  if (instructionInvalidatesL0Cache(MI.getOpcode()) ||
+      instructionInvalidatesL1Cache(MI.getOpcode())) {
     Wait.VmCnt = 0;
   }
 
@@ -1231,8 +1328,10 @@
   return false;
 }
 
-void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
-                                               WaitcntBrackets *ScoreBrackets) {
+void SIInsertWaitcnts::updateWaitcntBracketAfter(
+    MachineInstr &Inst, WaitcntBrackets *ScoreBrackets) {
+  clearStaleCacheLevelsIfIsCacheInvalidationInstruction(Inst, *ScoreBrackets);
+
   // Now look at the instruction opcode. If it is a memory access
   // instruction, update the upper-bound of the appropriate counter's
   // bracket and the destination operand scores.
@@ -1275,11 +1369,8 @@
       ScoreBrackets->setPendingFlat();
   } else if (SIInstrInfo::isVMEM(Inst) &&
              // TODO: get a better carve out.
-             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
-             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
-             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
-             Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
-             Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
+             !instructionInvalidatesL1Cache(Inst.getOpcode()) &&
+             !instructionInvalidatesL0Cache(Inst.getOpcode())) {
     if (!ST->hasVscnt())
       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
     else if ((Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst)) ||
@@ -1344,6 +1435,15 @@
 
   VgprUB = std::max(VgprUB, Other.VgprUB);
   SgprUB = std::max(SgprUB, Other.SgprUB);
+  // Merge the per-level stale flags. A level that newly becomes potentially
+  // stale is reported as a change in the merged state via StrictDom.
+  for (MemoryCacheLevel Level : memoryCacheLevels()) {
+    if (Other.HasPotentiallyStaleCache[Level] &&
+        !HasPotentiallyStaleCache[Level]) {
+      HasPotentiallyStaleCache[Level] = true;
+      StrictDom = true;
+    }
+  }
 
   for (auto T : inst_counter_types()) {
     // Merge event flags for this counter
@@ -1488,7 +1588,7 @@
       }
     }
 
-    updateEventWaitcntAfter(Inst, &ScoreBrackets);
+    updateWaitcntBracketAfter(Inst, &ScoreBrackets);
 
 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
     // If this instruction generates a S_SETVSKIP because it is an
@@ -1526,6 +1626,68 @@
   return Modified;
 }
 
+// Clear the potentially-stale flag of every cache level that MI invalidates;
+// does nothing for other instructions.
+void SIInsertWaitcnts::clearStaleCacheLevelsIfIsCacheInvalidationInstruction(
+    MachineInstr &MI, WaitcntBrackets &Brackets) {
+  if (instructionInvalidatesL0Cache(MI.getOpcode()))
+    Brackets.clearPotentiallyStaleCacheAtLevel(MEM_CACHE_LVL_0);
+
+  if (instructionInvalidatesL1Cache(MI.getOpcode()))
+    Brackets.clearPotentiallyStaleCacheAtLevel(MEM_CACHE_LVL_1);
+}
+
+// Erase, in every block, the cache invalidations whose target cache level is
+// not potentially stale according to the brackets flowing into the block.
+// Returns true if at least one instruction was erased.
+// NOTE(review): staleness is derived only from the accesses issued by the
+// code being compiled; confirm against the AMDGPU memory model that dropping
+// an invalidation is sound when other waves share the same cache.
+bool SIInsertWaitcnts::removeUnnecessaryMemoryCacheInvalidationInstructions() {
+  bool InstructionRemoved = false;
+  // Blocks without incoming brackets start from an empty state.
+  const WaitcntBrackets EmptyBrackets(ST);
+  for (auto &Entry : BlockInfos) {
+    const WaitcntBrackets &Incoming =
+        Entry.second.Incoming ? *Entry.second.Incoming : EmptyBrackets;
+    InstructionRemoved |= removeIfUnnecessaryMemoryCacheInvalidationInBlock(
+        *Entry.first, Incoming);
+  }
+  return InstructionRemoved;
+}
+
+// Walk BB starting from the state in Bracket and erase every cache
+// invalidation that isUnnecessaryMemoryCacheInvalidation() reports. Returns
+// true if any instruction was erased.
+bool SIInsertWaitcnts::removeIfUnnecessaryMemoryCacheInvalidationInBlock(
+    MachineBasicBlock &BB, const WaitcntBrackets &Bracket) {
+  WaitcntBrackets LocalBracket = Bracket;
+  // Collect first and erase afterwards so that the block is not modified
+  // while it is being iterated.
+  SmallVector<MachineInstr *, 4> InstructionsToErase;
+  for (MachineInstr &Inst : BB) {
+    if (isUnnecessaryMemoryCacheInvalidation(Inst, LocalBracket))
+      InstructionsToErase.push_back(&Inst);
+    updateWaitcntBracketAfter(Inst, &LocalBracket);
+  }
+
+  for (MachineInstr *Inst : InstructionsToErase)
+    Inst->eraseFromParent();
+  return !InstructionsToErase.empty();
+}
+
+// Return true if Inst invalidates a cache level that Brackets does not record
+// as potentially stale.
+bool SIInsertWaitcnts::isUnnecessaryMemoryCacheInvalidation(
+    const MachineInstr &Inst, WaitcntBrackets &Brackets) {
+  const unsigned OpCode = Inst.getOpcode();
+  if (instructionInvalidatesL0Cache(OpCode))
+    return !Brackets.hasPotentiallyStaleCacheAtLevel(MEM_CACHE_LVL_0);
+  if (instructionInvalidatesL1Cache(OpCode))
+    return !Brackets.hasPotentiallyStaleCacheAtLevel(MEM_CACHE_LVL_1);
+  return false;
+}
+
 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   ST = &MF.getSubtarget();
   TII = ST->getInstrInfo();
@@ -1615,6 +1777,9 @@
     }
   } while (Repeat);
 
+  // Erasing invalidations changes the function, so fold it into the result.
+  Modified |= removeUnnecessaryMemoryCacheInvalidationInstructions();
+
   SmallVector EndPgmBlocks;
 
   bool HaveScalarStores = false;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
@@ -4,6 +4,9 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX10,GFX10WGP %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode
-verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX10,GFX10CU %s +; Stores to this variable are used to stop the si-insert-waitcnt pass from removing the cache invalidation instructions. +@gint = external addrspace(1) global i32, align 4 + ; FUNC-LABEL: {{^}}system_one_as_acquire: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE @@ -22,6 +25,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_acquire() { entry: + store i32 0, i32 addrspace(1)* @gint fence syncscope("one-as") acquire ret void } @@ -58,6 +62,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_acq_rel() { entry: + store i32 0, i32 addrspace(1)* @gint fence syncscope("one-as") acq_rel ret void } @@ -78,6 +83,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_seq_cst() { entry: + store i32 0, i32 addrspace(1)* @gint fence syncscope("one-as") seq_cst ret void } @@ -156,6 +162,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_acquire() { entry: + store i32 0, i32 addrspace(1)* @gint fence syncscope("agent-one-as") acquire ret void } @@ -192,6 +199,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_acq_rel() { entry: + store i32 0, i32 addrspace(1)* @gint fence syncscope("agent-one-as") acq_rel ret void } @@ -212,6 +220,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_seq_cst() { entry: + store i32 0, i32 addrspace(1)* @gint fence syncscope("agent-one-as") seq_cst ret void } @@ -233,6 +242,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_acquire() { entry: + store i32 0, i32 addrspace(1)* @gint fence syncscope("workgroup-one-as") acquire ret void } @@ -253,6 +263,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_release() { entry: + store i32 0, i32 addrspace(1)* @gint fence syncscope("workgroup-one-as") release ret void } @@ -274,6 
+285,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_acq_rel() { entry: + store i32 0, i32 addrspace(1)* @gint fence syncscope("workgroup-one-as") acq_rel ret void } @@ -295,6 +307,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_seq_cst() { entry: + store i32 0, i32 addrspace(1)* @gint fence syncscope("workgroup-one-as") seq_cst ret void } @@ -373,6 +386,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_acquire() { entry: + store i32 0, i32 addrspace(1)* @gint fence acquire ret void } @@ -413,6 +427,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_acq_rel() { entry: + store i32 0, i32 addrspace(1)* @gint fence acq_rel ret void } @@ -435,6 +450,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_seq_cst() { entry: + store i32 0, i32 addrspace(1)* @gint fence seq_cst ret void } @@ -513,6 +529,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_acquire() { entry: + store i32 0, i32 addrspace(1)* @gint fence syncscope("agent") acquire ret void } @@ -553,6 +570,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_acq_rel() { entry: + store i32 0, i32 addrspace(1)* @gint fence syncscope("agent") acq_rel ret void } @@ -575,6 +593,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_seq_cst() { entry: + store i32 0, i32 addrspace(1)* @gint fence syncscope("agent") seq_cst ret void } @@ -596,6 +615,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_acquire() { entry: + store i32 0, i32 addrspace(1)* @gint fence syncscope("workgroup") acquire ret void } @@ -616,6 +636,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_release() { entry: + store i32 0, i32 addrspace(1)* @gint fence syncscope("workgroup") release ret void } @@ -637,6 +658,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define 
amdgpu_kernel void @workgroup_acq_rel() { entry: + store i32 0, i32 addrspace(1)* @gint fence syncscope("workgroup") acq_rel ret void } @@ -658,6 +680,7 @@ ; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_seq_cst() { entry: + store i32 0, i32 addrspace(1)* @gint fence syncscope("workgroup") seq_cst ret void } diff --git a/llvm/test/CodeGen/AMDGPU/cache_invalidate.mir b/llvm/test/CodeGen/AMDGPU/cache_invalidate.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cache_invalidate.mir @@ -0,0 +1,229 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-insert-waitcnts %s -o - | FileCheck -check-prefixes=CHECK %s + +--- | + @lds = addrspace(3) global float undef, align 16 + + define amdgpu_kernel void @no_stores() {ret void} + define amdgpu_kernel void @lds_store() {ret void} + define amdgpu_kernel void @smem_store() {ret void} + define amdgpu_kernel void @smem_store_with_multiple_invalidations() {ret void} + define amdgpu_kernel void @smem_store_with_invalidations_in_side_nodes() {ret void} + define amdgpu_kernel void @smem_load_in_if_block_with_invalidations_in_side_nodes() {ret void} + define amdgpu_kernel void @smem_load_in_else_block_with_invalidations_in_side_nodes() {ret void} + define amdgpu_kernel void @smem_load_at_end_of_loop() {ret void} +... + +--- +name: no_stores + +body: | + bb.0: + ; CHECK-LABEL: name: no_stores + ; CHECK: S_WAITCNT 0 + ; CHECK-NOT: BUFFER_WBINVL1 + ; CHECK: S_ENDPGM 0 + BUFFER_WBINVL1 implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: lds_store + +body: | + bb.0: + ; CHECK-LABEL: name: lds_store + ; CHECK: S_WAITCNT 0 + ; CHECK: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: DS_WRITE_B32_gfx9 killed renamable $vgpr0, renamable $vgpr0, 0, 0, implicit $exec :: (store 4 into @lds, align 16, addrspace 3) + ; CHECK-NOT: BUFFER_WBINVL1 + ; CHECK: S_ENDPGM 0 + renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + DS_WRITE_B32_gfx9 killed renamable $vgpr0, renamable $vgpr0, 0, 0, implicit $exec :: (store 4 into @lds, align 16, addrspace 3) + BUFFER_WBINVL1 implicit $exec + S_ENDPGM 0 +... + +--- +name: smem_store + +body: | + bb.0: + liveins: $sgpr2 + ; CHECK-LABEL: name: smem_store + ; CHECK: S_WAITCNT 0 + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: S_WAITCNT 3952 + ; CHECK: BUFFER_WBINVL1 implicit $exec + ; CHECK: S_ENDPGM 0 + BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec + BUFFER_WBINVL1 implicit $exec + S_ENDPGM 0 +... + +--- +name: smem_store_with_multiple_invalidations + +body: | + bb.0: + liveins: $sgpr2 + ; CHECK-LABEL: name: smem_store_with_multiple_invalidations + ; CHECK: S_WAITCNT 0 + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: S_WAITCNT 3952 + ; CHECK: BUFFER_WBINVL1 implicit $exec + ; CHECK-NOT: BUFFER_WBINVL1 + ; CHECK: S_ENDPGM 0 + BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec + BUFFER_WBINVL1 implicit $exec + BUFFER_WBINVL1 implicit $exec + S_ENDPGM 0 +... + +--- +name: smem_store_with_invalidations_in_side_nodes + +# The buffer_WBINVL1 instructions should be left in both paths. 
+ +body: | + ; CHECK-LABEL: name: smem_store_with_invalidations_in_side_nodes + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: S_WAITCNT 0 + ; CHECK: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFSET renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: S_WAITCNT 3952 + ; CHECK: V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; CHECK: $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + ; CHECK: S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + ; CHECK: bb.1: + ; CHECK: BUFFER_WBINVL1 implicit $exec + ; CHECK: S_ENDPGM 0 + ; CHECK: bb.2: + ; CHECK: BUFFER_WBINVL1 implicit $exec + ; CHECK: S_ENDPGM 0 + bb.0: + renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFSET renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec + V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + + bb.1: + BUFFER_WBINVL1 implicit $exec + S_ENDPGM 0 + + bb.2: + BUFFER_WBINVL1 implicit $exec + S_ENDPGM 0 +... + +--- +name: smem_load_in_if_block_with_invalidations_in_side_nodes + +# The buffer_WBINVL1 instruction in the else block can be removed because no memory access precedes it on that path. 
+ +body: | + ; CHECK-LABEL: name: smem_load_in_if_block_with_invalidations_in_side_nodes + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: S_WAITCNT 0 + ; CHECK: V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; CHECK: $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + ; CHECK: S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + ; CHECK: bb.1: + ; CHECK: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFSET renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: S_WAITCNT 3952 + ; CHECK: BUFFER_WBINVL1 implicit $exec + ; CHECK: S_ENDPGM 0 + ; CHECK: bb.2: + ; CHECK: S_ENDPGM 0 + bb.0: + V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + + bb.1: + renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFSET renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec + BUFFER_WBINVL1 implicit $exec + S_ENDPGM 0 + + bb.2: + BUFFER_WBINVL1 implicit $exec + S_ENDPGM 0 +... + +--- +name: smem_load_in_else_block_with_invalidations_in_side_nodes + +# The buffer_WBINVL1 instruction in the if block can be removed because no memory access precedes it on that path. 
+ +body: | + ; CHECK-LABEL: name: smem_load_in_else_block_with_invalidations_in_side_nodes + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: S_WAITCNT 0 + ; CHECK: V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; CHECK: $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + ; CHECK: S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + ; CHECK: bb.1: + ; CHECK: S_ENDPGM 0 + ; CHECK: bb.2: + ; CHECK: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFSET renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: S_WAITCNT 3952 + ; CHECK: BUFFER_WBINVL1 implicit $exec + ; CHECK: S_ENDPGM 0 + bb.0: + V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + + bb.1: + BUFFER_WBINVL1 implicit $exec + S_ENDPGM 0 + + bb.2: + renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFSET renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec + BUFFER_WBINVL1 implicit $exec + S_ENDPGM 0 +... + +--- +name: smem_load_at_end_of_loop + +# The buffer_WBINVL1 instruction at the start of the loop should not be removed. 
+ +body: | + ; CHECK-LABEL: name: smem_load_at_end_of_loop + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: S_WAITCNT 0 + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFSET renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.2: + ; CHECK: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; CHECK: S_WAITCNT 3952 + ; CHECK: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; CHECK: $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + ; CHECK: BUFFER_WBINVL1 implicit $exec + ; CHECK: S_CBRANCH_VCCZ %bb.1, implicit killed $vcc + ; CHECK: bb.3: + ; CHECK: S_ENDPGM 0 + bb.0: + S_BRANCH %bb.1 + + bb.3: + renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFSET renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec + S_BRANCH %bb.1 + + bb.1: + V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + BUFFER_WBINVL1 implicit $exec + S_CBRANCH_VCCZ %bb.3, implicit killed $vcc + + bb.2: + S_ENDPGM 0 +... 
+--- diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll @@ -1,14 +1,45 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=tahiti -show-mc-encoding < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=VI %s declare void @llvm.amdgcn.buffer.wbinvl1() #0 +@gint = external addrspace(1) global i32, align 4 -; GCN-LABEL: {{^}}test_buffer_wbinvl1: -; GCN-NEXT: ; %bb.0: -; SI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xc4,0xe1,0x00,0x00,0x00,0x00] -; VI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xf8,0xe0,0x00,0x00,0x00,0x00] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_buffer_wbinvl1() #0 { +; SI-LABEL: test_buffer_wbinvl1: +; SI: ; %bb.0: +; SI-NEXT: s_getpc_b64 s[0:1] ; encoding: [0x00,0x1f,0x80,0xbe] +; SI-NEXT: s_add_u32 s0, s0, gint@gotpcrel32@lo+4 ; encoding: [0x00,0xff,0x00,0x80,A,A,A,A] +; SI-NEXT: ; fixup A - offset: 4, value: gint@gotpcrel32@lo+4, kind: FK_PCRel_4 +; SI-NEXT: s_addc_u32 s1, s1, gint@gotpcrel32@hi+12 ; encoding: [0x01,0xff,0x01,0x82,A,A,A,A] +; SI-NEXT: ; fixup A - offset: 4, value: gint@gotpcrel32@hi+12, kind: FK_PCRel_4 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x01,0x40,0xc0] +; SI-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0xf0,0x00,0x00] +; SI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] +; SI-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; SI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80] +; SI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; SI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xc4,0xe1,0x00,0x00,0x00,0x00] +; SI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; VI-LABEL: test_buffer_wbinvl1: +; VI: ; %bb.0: +; VI-NEXT: s_getpc_b64 s[0:1] ; encoding: [0x00,0x1c,0x80,0xbe] +; VI-NEXT: s_add_u32 s0, s0, gint@gotpcrel32@lo+4 ; encoding: [0x00,0xff,0x00,0x80,A,A,A,A] +; VI-NEXT: ; fixup A - offset: 4, value: gint@gotpcrel32@lo+4, kind: FK_PCRel_4 +; VI-NEXT: s_addc_u32 s1, s1, gint@gotpcrel32@hi+12 ; encoding: [0x01,0xff,0x01,0x82,A,A,A,A] +; VI-NEXT: ; fixup A - offset: 4, value: gint@gotpcrel32@hi+12, kind: FK_PCRel_4 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e] +; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; encoding: [0x00,0x02,0x00,0x7e] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; encoding: [0x01,0x02,0x02,0x7e] +; VI-NEXT: flat_store_dword v[0:1], v2 ; encoding: [0x00,0x00,0x70,0xdc,0x00,0x02,0x00,0x00] +; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; VI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xf8,0xe0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] + store i32 0, i32 addrspace(1)* @gint call void @llvm.amdgcn.buffer.wbinvl1() ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll @@ -1,12 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=tahiti -show-mc-encoding < %s | FileCheck -check-prefix=SI %s declare void 
@llvm.amdgcn.buffer.wbinvl1.sc() #0 +@gint = external addrspace(1) global i32, align 4 -; SI-LABEL: {{^}}test_buffer_wbinvl1_sc: -; SI-NEXT: ; %bb.0: -; SI-NEXT: buffer_wbinvl1_sc ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00] -; SI-NEXT: s_endpgm define amdgpu_kernel void @test_buffer_wbinvl1_sc() #0 { +; SI-LABEL: test_buffer_wbinvl1_sc: +; SI: ; %bb.0: +; SI-NEXT: s_getpc_b64 s[0:1] ; encoding: [0x00,0x1f,0x80,0xbe] +; SI-NEXT: s_add_u32 s0, s0, gint@gotpcrel32@lo+4 ; encoding: [0x00,0xff,0x00,0x80,A,A,A,A] +; SI-NEXT: ; fixup A - offset: 4, value: gint@gotpcrel32@lo+4, kind: FK_PCRel_4 +; SI-NEXT: s_addc_u32 s1, s1, gint@gotpcrel32@hi+12 ; encoding: [0x01,0xff,0x01,0x82,A,A,A,A] +; SI-NEXT: ; fixup A - offset: 4, value: gint@gotpcrel32@hi+12, kind: FK_PCRel_4 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x01,0x40,0xc0] +; SI-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0xf0,0x00,0x00] +; SI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] +; SI-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; SI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80] +; SI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; SI-NEXT: buffer_wbinvl1_sc ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00] +; SI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] + store i32 0, i32 addrspace(1)* @gint call void @llvm.amdgcn.buffer.wbinvl1.sc() ret void } diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -35,8 +35,6 @@ ; GFX90A-TGSPLIT-LABEL: singlethread_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("singlethread") acquire ret void @@ -70,8 +68,6 @@ ; 
GFX90A-TGSPLIT-LABEL: singlethread_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("singlethread") release ret void @@ -105,8 +101,6 @@ ; GFX90A-TGSPLIT-LABEL: singlethread_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("singlethread") acq_rel ret void @@ -140,8 +134,6 @@ ; GFX90A-TGSPLIT-LABEL: singlethread_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("singlethread") seq_cst ret void @@ -175,8 +167,6 @@ ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("singlethread-one-as") acquire ret void @@ -210,8 +200,6 @@ ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("singlethread-one-as") release ret void @@ -245,8 +233,6 @@ ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("singlethread-one-as") acq_rel ret void @@ -280,8 +266,6 @@ ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("singlethread-one-as") seq_cst ret void @@ -315,8 +299,6 @@ ; GFX90A-TGSPLIT-LABEL: wavefront_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("wavefront") acquire ret void @@ -350,8 +332,6 @@ ; GFX90A-TGSPLIT-LABEL: wavefront_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("wavefront") release ret void @@ -385,8 +365,6 @@ ; GFX90A-TGSPLIT-LABEL: wavefront_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence 
syncscope("wavefront") acq_rel ret void @@ -420,8 +398,6 @@ ; GFX90A-TGSPLIT-LABEL: wavefront_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("wavefront") seq_cst ret void @@ -455,8 +431,6 @@ ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("wavefront-one-as") acquire ret void @@ -490,8 +464,6 @@ ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("wavefront-one-as") release ret void @@ -525,8 +497,6 @@ ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("wavefront-one-as") acq_rel ret void @@ -560,8 +530,6 @@ ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("wavefront-one-as") seq_cst ret void @@ -582,7 +550,6 @@ ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: workgroup_acquire_fence: @@ -603,10 +570,7 @@ ; GFX90A-TGSPLIT-LABEL: workgroup_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("workgroup") acquire ret void @@ -648,8 +612,6 @@ ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("workgroup") release ret void @@ -670,7 +632,6 @@ ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: buffer_gl0_inv ; 
GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: workgroup_acq_rel_fence: @@ -691,10 +652,7 @@ ; GFX90A-TGSPLIT-LABEL: workgroup_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("workgroup") acq_rel ret void @@ -715,7 +673,6 @@ ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: workgroup_seq_cst_fence: @@ -736,10 +693,7 @@ ; GFX90A-TGSPLIT-LABEL: workgroup_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("workgroup") seq_cst ret void @@ -758,7 +712,6 @@ ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: workgroup_one_as_acquire_fence: @@ -776,10 +729,7 @@ ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("workgroup-one-as") acquire ret void @@ -816,8 +766,6 @@ ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("workgroup-one-as") release ret void @@ -836,7 +784,6 @@ ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence: @@ -854,10 +801,7 @@ ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry 
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("workgroup-one-as") acq_rel ret void @@ -876,7 +820,6 @@ ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence: @@ -894,30 +837,43 @@ ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("workgroup-one-as") seq_cst ret void } -define amdgpu_kernel void @agent_acquire_fence() { +define amdgpu_kernel void @agent_acquire_fence(i8 addrspace(1)* %ptr) { ; GFX6-LABEL: agent_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_byte v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_acquire_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX10-WGP-NEXT: buffer_gl0_inv @@ -926,6 +882,10 @@ ; ; GFX10-CU-LABEL: agent_acquire_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -934,23 +894,36 @@ ; ; SKIP-CACHE-INV-LABEL: agent_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: + store i8 0, i8 addrspace(1)* %ptr, align 1 fence syncscope("agent") acquire ret void } @@ -992,28 +965,42 @@ ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("agent") release ret void } -define amdgpu_kernel void @agent_acq_rel_fence() { +define amdgpu_kernel void @agent_acq_rel_fence(i8 addrspace(1)* %ptr) { ; GFX6-LABEL: agent_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_byte v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_acq_rel_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1022,6 +1009,10 @@ ; ; GFX10-CU-LABEL: agent_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1030,42 +1021,71 @@ ; ; SKIP-CACHE-INV-LABEL: agent_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: + store i8 0, i8 addrspace(1)* %ptr, align 1 fence syncscope("agent") acq_rel ret void } -define amdgpu_kernel void @agent_seq_cst_fence() { +define amdgpu_kernel void @agent_seq_cst_fence(i8 addrspace(1)* %ptr) { ; GFX6-LABEL: agent_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 
+; GFX7-NEXT: flat_store_byte v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_seq_cst_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1074,6 +1094,10 @@ ; ; GFX10-CU-LABEL: agent_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1082,42 +1106,71 @@ ; ; SKIP-CACHE-INV-LABEL: agent_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: + store i8 0, i8 addrspace(1)* %ptr, align 1 fence syncscope("agent") seq_cst ret void } -define amdgpu_kernel void @agent_one_as_acquire_fence() { +define amdgpu_kernel void @agent_one_as_acquire_fence(i8 addrspace(1)* %ptr) { ; GFX6-LABEL: agent_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_byte v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_one_as_acquire_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1126,6 +1179,10 @@ ; ; GFX10-CU-LABEL: agent_one_as_acquire_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1134,23 +1191,36 @@ ; ; SKIP-CACHE-INV-LABEL: agent_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_one_as_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: + store i8 0, i8 addrspace(1)* %ptr, align 1 fence syncscope("agent-one-as") acquire ret void } @@ -1192,28 +1262,42 @@ ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("agent-one-as") release ret void } -define amdgpu_kernel void @agent_one_as_acq_rel_fence() { +define amdgpu_kernel void @agent_one_as_acq_rel_fence(i8 addrspace(1)* %ptr) { ; GFX6-LABEL: agent_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; 
GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_byte v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_one_as_acq_rel_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1222,6 +1306,10 @@ ; ; GFX10-CU-LABEL: agent_one_as_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1230,42 +1318,71 @@ ; ; SKIP-CACHE-INV-LABEL: agent_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: + store i8 0, i8 addrspace(1)* %ptr, align 1 fence syncscope("agent-one-as") acq_rel ret void } -define amdgpu_kernel void @agent_one_as_seq_cst_fence() { +define amdgpu_kernel void @agent_one_as_seq_cst_fence(i8 addrspace(1)* %ptr) { ; GFX6-LABEL: agent_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_byte v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_one_as_seq_cst_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_byte 
v0, v0, s[0:1] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1274,6 +1391,10 @@ ; ; GFX10-CU-LABEL: agent_one_as_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1282,42 +1403,71 @@ ; ; SKIP-CACHE-INV-LABEL: agent_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: + store i8 0, i8 addrspace(1)* %ptr, align 1 fence syncscope("agent-one-as") seq_cst ret void } -define amdgpu_kernel void 
@system_acquire_fence() { +define amdgpu_kernel void @system_acquire_fence(i8 addrspace(1)* %ptr) { ; GFX6-LABEL: system_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_byte v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_acquire_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1326,6 +1476,10 @@ ; ; GFX10-CU-LABEL: system_acquire_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1334,11 +1488,21 @@ ; ; SKIP-CACHE-INV-LABEL: system_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt 
lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 @@ -1348,15 +1512,18 @@ ; ; GFX90A-TGSPLIT-LABEL: system_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: + store i8 0, i8 addrspace(1)* %ptr, align 1 fence acquire ret void } @@ -1400,28 +1567,42 @@ ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence release ret void } -define amdgpu_kernel void @system_acq_rel_fence() { +define amdgpu_kernel void @system_acq_rel_fence(i8 addrspace(1)* %ptr) { ; GFX6-LABEL: system_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry 
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_byte v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_acq_rel_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1430,6 +1611,10 @@ ; ; GFX10-CU-LABEL: system_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1438,11 +1623,21 @@ ; ; SKIP-CACHE-INV-LABEL: system_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 @@ -1452,34 +1647,53 @@ ; ; GFX90A-TGSPLIT-LABEL: system_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: + store i8 0, i8 addrspace(1)* %ptr, align 1 fence acq_rel ret void } -define amdgpu_kernel void @system_seq_cst_fence() { +define amdgpu_kernel void @system_seq_cst_fence(i8 addrspace(1)* %ptr) { ; GFX6-LABEL: system_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_byte v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_seq_cst_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1488,6 +1702,10 @@ ; ; 
GFX10-CU-LABEL: system_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1496,11 +1714,21 @@ ; ; SKIP-CACHE-INV-LABEL: system_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 @@ -1510,34 +1738,53 @@ ; ; GFX90A-TGSPLIT-LABEL: system_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: + store i8 0, i8 addrspace(1)* %ptr, align 1 fence seq_cst ret void } -define amdgpu_kernel void @system_one_as_acquire_fence() { 
+define amdgpu_kernel void @system_one_as_acquire_fence(i8 addrspace(1)* %ptr) { ; GFX6-LABEL: system_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_byte v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_one_as_acquire_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1546,6 +1793,10 @@ ; ; GFX10-CU-LABEL: system_one_as_acquire_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1554,11 +1805,21 @@ ; ; SKIP-CACHE-INV-LABEL: system_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; 
SKIP-CACHE-INV-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 @@ -1568,15 +1829,18 @@ ; ; GFX90A-TGSPLIT-LABEL: system_one_as_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: + store i8 0, i8 addrspace(1)* %ptr, align 1 fence syncscope("one-as") acquire ret void } @@ -1620,28 +1884,42 @@ ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("one-as") release ret void } -define amdgpu_kernel void @system_one_as_acq_rel_fence() { +define amdgpu_kernel void @system_one_as_acq_rel_fence(i8 addrspace(1)* %ptr) { ; GFX6-LABEL: system_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_one_as_acq_rel_fence: ; GFX7: ; 
%bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_byte v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_one_as_acq_rel_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1650,6 +1928,10 @@ ; ; GFX10-CU-LABEL: system_one_as_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1658,11 +1940,21 @@ ; ; SKIP-CACHE-INV-LABEL: system_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 @@ -1672,34 +1964,53 @@ ; ; GFX90A-TGSPLIT-LABEL: system_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: + store i8 0, i8 addrspace(1)* %ptr, align 1 fence syncscope("one-as") acq_rel ret void } -define amdgpu_kernel void @system_one_as_seq_cst_fence() { +define amdgpu_kernel void @system_one_as_seq_cst_fence(i8 addrspace(1)* %ptr) { ; GFX6-LABEL: system_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_byte v[0:1], v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_one_as_seq_cst_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1708,6 
+2019,10 @@ ; ; GFX10-CU-LABEL: system_one_as_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_byte v0, v0, s[0:1] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1716,11 +2031,21 @@ ; ; SKIP-CACHE-INV-LABEL: system_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 @@ -1730,15 +2055,18 @@ ; ; GFX90A-TGSPLIT-LABEL: system_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_byte v0, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: + store i8 0, i8 addrspace(1)* %ptr, align 1 fence syncscope("one-as") seq_cst ret void }