This is an archive of the discontinued LLVM Phabricator instance.

Differential D71934

[AMDGPU] need to insert wait between the scalar load and vector store to the same address to avoid WAR conflict.
ClosedPublic

Authored by alex-t on Dec 27 2019, 8:27 AM.

Download Raw Diff

Details

Reviewers

rampitec
vpykhtin
nhaehnle

Commits

rGca8b20ca3ba1: [AMDGPU] need to insert wait between the scalar load and vector store to the…

Summary

Before divergence-driven ISel introduced scalar loads from the global address space, we relied on the VMEM operation ordering enforced by the hardware. Now we can easily get a WAR hazard when a scalar load is followed by a vector store to the same address.
The case is here: https://github.com/RadeonOpenCompute/ROCm/issues/500

The current fix relies on MachineMemOperand equality to check that the SMRD and VMEM instructions use the same address.
A proper fix would involve building alias analysis on the machine IR, which is obviously too big a hammer at the moment.

Diff Detail

Event Timeline

alex-t created this revision.Dec 27 2019, 8:27 AM

Herald added a project: Restricted Project. · View Herald TranscriptDec 27 2019, 8:27 AM

Herald added subscribers: hiraditya, t-tye, tpr and 6 others. · View Herald Transcript

alex-t edited the summary of this revision. (Show Details)Dec 27 2019, 8:30 AM

rampitec added inline comments.Dec 30 2019, 1:45 PM

llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
1	Please add -check-prefix=GCN.
8	Test should have no numbered values. Please run opt -instnamer on it.

Test updated.

LGTM. Delete source_filename from the test before push.

llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
9	Delete this line.

This revision is now accepted and ready to land.Jan 2 2020, 10:40 AM

Closed by commit rGca8b20ca3ba1: [AMDGPU] need to insert wait between the scalar load and vector store to the… (authored by alex-t). · Explain WhyJan 4 2020, 7:28 AM

This revision was automatically updated to reflect the committed changes.

foad mentioned this in D101177: [AMDGPU] Skip invariant loads when avoiding WAR conflicts.Apr 26 2021, 2:10 AM

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

SIInsertWaitcnts.cpp

21 lines

test/

CodeGen/

AMDGPU/

smrd_vmem_war.ll

26 lines

Diff 235425

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Show All 36 Lines
#include "llvm/ADT/SmallVector.h"		#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"		#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"		#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"		#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"		#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"		#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"		#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"		#include "llvm/CodeGen/MachineOperand.h"
		#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"		#include "llvm/CodeGen/MachineRegisterInfo.h"
		#include "llvm/InitializePasses.h"
#include "llvm/IR/DebugLoc.h"		#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"		#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"		#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"		#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"		#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"		#include "llvm/Support/raw_ostream.h"
#include <algorithm>		#include <algorithm>
#include <cassert>		#include <cassert>
▲ Show 20 Lines • Show All 313 Lines • ▼ Show 20 Lines
private:		private:
const GCNSubtarget *ST = nullptr;		const GCNSubtarget *ST = nullptr;
const SIInstrInfo *TII = nullptr;		const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;		const SIRegisterInfo *TRI = nullptr;
const MachineRegisterInfo *MRI = nullptr;		const MachineRegisterInfo *MRI = nullptr;
AMDGPU::IsaVersion IV;		AMDGPU::IsaVersion IV;

DenseSet<MachineInstr *> TrackedWaitcntSet;		DenseSet<MachineInstr *> TrackedWaitcntSet;
		DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
		MachinePostDominatorTree *PDT;

struct BlockInfo {		struct BlockInfo {
MachineBasicBlock *MBB;		MachineBasicBlock *MBB;
std::unique_ptr<WaitcntBrackets> Incoming;		std::unique_ptr<WaitcntBrackets> Incoming;
bool Dirty = true;		bool Dirty = true;

explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}		explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
};		};
Show All 18 Lines	public:
bool runOnMachineFunction(MachineFunction &MF) override;		bool runOnMachineFunction(MachineFunction &MF) override;

StringRef getPassName() const override {		StringRef getPassName() const override {
return "SI insert wait instructions";		return "SI insert wait instructions";
}		}

void getAnalysisUsage(AnalysisUsage &AU) const override {		void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();		AU.setPreservesCFG();
		AU.addRequired<MachinePostDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);		MachineFunctionPass::getAnalysisUsage(AU);
}		}

bool isForceEmitWaitcnt() const {		bool isForceEmitWaitcnt() const {
for (auto T : inst_counter_types())		for (auto T : inst_counter_types())
if (ForceEmitWaitcnt[T])		if (ForceEmitWaitcnt[T])
return true;		return true;
return false;		return false;
▲ Show 20 Lines • Show All 370 Lines • ▼ Show 20 Lines	bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
// Scalar memory read always can go out of order.		// Scalar memory read always can go out of order.
if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))		if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
return true;		return true;
return MixedPendingEvents[T];		return MixedPendingEvents[T];
}		}

INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,		INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
false)		false)
		INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,		INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
false)		false)

char SIInsertWaitcnts::ID = 0;		char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;		char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {		FunctionPass *llvm::createSIInsertWaitcntsPass() {
▲ Show 20 Lines • Show All 204 Lines • ▼ Show 20 Lines	if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
// Two cases are handled for destination operands:		// Two cases are handled for destination operands:
// 1) If the destination operand was defined by a load, add the s_waitcnt		// 1) If the destination operand was defined by a load, add the s_waitcnt
// instruction to guarantee the right WAW order.		// instruction to guarantee the right WAW order.
// 2) If a destination operand that was used by a recent export/store ins,		// 2) If a destination operand that was used by a recent export/store ins,
// add s_waitcnt on exp_cnt to guarantee the WAR order.		// add s_waitcnt on exp_cnt to guarantee the WAR order.
if (MI.mayStore()) {		if (MI.mayStore()) {
// FIXME: Should not be relying on memoperands.		// FIXME: Should not be relying on memoperands.
for (const MachineMemOperand *Memop : MI.memoperands()) {		for (const MachineMemOperand *Memop : MI.memoperands()) {
		const Value *Ptr = Memop->getValue();
		if (SLoadAddresses.count(Ptr)) {
		addWait(Wait, LGKM_CNT, 0);
		if (PDT->dominates(MI.getParent(),
		SLoadAddresses.find(Ptr)->second))
		SLoadAddresses.erase(Ptr);
		}
unsigned AS = Memop->getAddrSpace();		unsigned AS = Memop->getAddrSpace();
if (AS != AMDGPUAS::LOCAL_ADDRESS)		if (AS != AMDGPUAS::LOCAL_ADDRESS)
continue;		continue;
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;		unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
ScoreBrackets.determineWait(		ScoreBrackets.determineWait(
VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);		VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
ScoreBrackets.determineWait(		ScoreBrackets.determineWait(
EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);		EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
▲ Show 20 Lines • Show All 371 Lines • ▼ Show 20 Lines	if (readsVCCZ(Inst)) {
if (ScoreBrackets.getScoreLB(LGKM_CNT) <		if (ScoreBrackets.getScoreLB(LGKM_CNT) <
ScoreBrackets.getScoreUB(LGKM_CNT) &&		ScoreBrackets.getScoreUB(LGKM_CNT) &&
ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {		ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
if (ST->hasReadVCCZBug())		if (ST->hasReadVCCZBug())
VCCZBugWorkAround = true;		VCCZBugWorkAround = true;
}		}
}		}

		if (TII->isSMRD(Inst)) {
		for (const MachineMemOperand *Memop : Inst.memoperands()) {
		const Value *Ptr = Memop->getValue();
		SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
		}
		}

// Generate an s_waitcnt instruction to be placed before		// Generate an s_waitcnt instruction to be placed before
// cur_Inst, if needed.		// cur_Inst, if needed.
Modified \|= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);		Modified \|= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
OldWaitcntInstr = nullptr;		OldWaitcntInstr = nullptr;

updateEventWaitcntAfter(Inst, &ScoreBrackets);		updateEventWaitcntAfter(Inst, &ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.		#if 0 // TODO: implement resource type check controlled by options with ub = LB.
Show All 33 Lines

bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {		bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
ST = &MF.getSubtarget<GCNSubtarget>();		ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();		TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();		TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();		MRI = &MF.getRegInfo();
IV = AMDGPU::getIsaVersion(ST->getCPU());		IV = AMDGPU::getIsaVersion(ST->getCPU());
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();		const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
		PDT = &getAnalysis<MachinePostDominatorTree>();

ForceEmitZeroWaitcnts = ForceEmitZeroFlag;		ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
for (auto T : inst_counter_types())		for (auto T : inst_counter_types())
ForceEmitWaitcnt[T] = false;		ForceEmitWaitcnt[T] = false;

HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);		HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);		HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);		HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
▲ Show 20 Lines • Show All 148 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll

This file was added.

				; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck %s
				rampitecUnsubmitted Not Done Reply Inline Actions Please add -check-prefix=GCN. rampitec: Please add -check-prefix=GCN.

				; CHECK-LABEL: BB0_1
				; CHECK: s_load_dword s{{[0-9]+}}, s{{\[}}[[ADDR_LO:[0-9]+]]{{\:}}[[ADDR_HI:[0-9]+]]{{\]}}, 0x0
				; CHECK: s_waitcnt lgkmcnt(0)
				; CHECK: global_store_dword v{{\[}}[[ADDR_LO]]{{\:}}[[ADDR_HI]]{{\]}}, v{{[0-9]+}}, off

				define amdgpu_kernel void @smrd_vmem_war(i32 addrspace(1)* nocapture %0, i64 addrspace(1)* nocapture %1) {
				rampitecUnsubmitted Not Done Reply Inline Actions Test should have no numbered values. Please run opt -instnamer on it. rampitec: Test should have no numbered values. Please run opt -instnamer on it.
				%3 = call i32 @llvm.amdgcn.workitem.id.x()
				rampitecUnsubmitted Not Done Reply Inline Actions Delete this line. rampitec: Delete this line.
				%4 = icmp eq i32 %3, 0
				br i1 %4, label %5, label %10

				5: ; preds = %2
				%6 = load i32, i32 addrspace(1)* %0, align 4
				store i32 0, i32 addrspace(1)* %0, align 4
				%7 = zext i32 %6 to i64
				%8 = load i64, i64 addrspace(1)* %1, align 8
				%9 = add i64 %8, %7
				store i64 %9, i64 addrspace(1)* %1, align 8
				br label %10

				10: ; preds = %5, %2
				ret void
				}

				declare i32 @llvm.amdgcn.workitem.id.x()