Diff 99425

llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

Show All 24 Lines
#include "AMDGPUSubtarget.h"		#include "AMDGPUSubtarget.h"
#include "SIDefines.h"		#include "SIDefines.h"
#include "SIInstrInfo.h"		#include "SIInstrInfo.h"
#include "llvm/ADT/Statistic.h"		#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"		#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFunctionPass.h"		#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"		#include "llvm/CodeGen/MachineInstrBuilder.h"
#include <unordered_map>		#include <unordered_map>
		#include <unordered_set>

using namespace llvm;		using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"		#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");		STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,		STATISTIC(NumSDWAInstructionsPeepholed,
"Number of instruction converted to SDWA.");		"Number of instruction converted to SDWA.");

namespace {		namespace {

class SDWAOperand;		class SDWAOperand;

class SIPeepholeSDWA : public MachineFunctionPass {		class SIPeepholeSDWA : public MachineFunctionPass {
		public:
		typedef SmallVector<SDWAOperand *, 4> SDWAOperandsVector;

private:		private:
MachineRegisterInfo *MRI;		MachineRegisterInfo *MRI;
const SIRegisterInfo *TRI;		const SIRegisterInfo *TRI;
const SIInstrInfo *TII;		const SIInstrInfo *TII;

std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;		std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
		std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;

Optional<int64_t> foldToImm(const MachineOperand &Op) const;		Optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:		public:
static char ID;		static char ID;

typedef SmallVector<std::unique_ptr<SDWAOperand>, 4> SDWAOperandsVector;

SIPeepholeSDWA() : MachineFunctionPass(ID) {		SIPeepholeSDWA() : MachineFunctionPass(ID) {
initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());		initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
}		}

bool runOnMachineFunction(MachineFunction &MF) override;		bool runOnMachineFunction(MachineFunction &MF) override;
void matchSDWAOperands(MachineFunction &MF);		void matchSDWAOperands(MachineFunction &MF);
		bool isConvertibleToSDWA(const MachineInstr &MI) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);		bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);

StringRef getPassName() const override { return "SI Peephole SDWA"; }		StringRef getPassName() const override { return "SI Peephole SDWA"; }

void getAnalysisUsage(AnalysisUsage &AU) const override {		void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();		AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);		MachineFunctionPass::getAnalysisUsage(AU);
}		}
▲ Show 20 Lines • Show All 388 Lines • ▼ Show 20 Lines	for (MachineInstr &MI : MBB) {
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);		MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

if (TRI->isPhysicalRegister(Src1->getReg()) \|\|		if (TRI->isPhysicalRegister(Src1->getReg()) \|\|
TRI->isPhysicalRegister(Dst->getReg()))		TRI->isPhysicalRegister(Dst->getReg()))
break;		break;

if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {		if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
auto SDWADst =		auto SDWADst =
make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);		make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');		DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
SDWAOperands[&MI] = std::move(SDWADst);		SDWAOperands[&MI] = std::move(SDWADst);
++NumSDWAPatternsFound;		++NumSDWAPatternsFound;
} else {		} else {
auto SDWASrc = make_unique<SDWASrcOperand>(		auto SDWASrc = make_unique<SDWASrcOperand>(
Src1, Dst, BYTE_1, false, false,		Src1, Dst, BYTE_1, false, false,
Opcode == AMDGPU::V_LSHRREV_B16_e32 ? false : true);		Opcode == AMDGPU::V_LSHRREV_B16_e32 ? false : true);
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');		DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
▲ Show 20 Lines • Show All 90 Lines • ▼ Show 20 Lines	for (MachineInstr &MI : MBB) {
++NumSDWAPatternsFound;		++NumSDWAPatternsFound;
break;		break;
}		}
}		}
}		}
}		}
}		}

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,		bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const {
const SDWAOperandsVector &SDWAOperands) {
// Check if this instruction can be converted to SDWA:		// Check if this instruction can be converted to SDWA:
// 1. Does this opcode support SDWA		// 1. Does this opcode support SDWA
if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)		if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
return false;		return false;

// 2. Are all operands - VGPRs		// 2. Are all operands - VGPRs
for (const MachineOperand &Operand : MI.explicit_operands()) {		for (const MachineOperand &Operand : MI.explicit_operands()) {
if (!Operand.isReg() \|\| !TRI->isVGPR(*MRI, Operand.getReg()))		if (!Operand.isReg() \|\| !TRI->isVGPR(*MRI, Operand.getReg()))
return false;		return false;
}		}

		return true;
		}

		bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
		const SDWAOperandsVector &SDWAOperands) {
// Convert to sdwa		// Convert to sdwa
int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());		int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
assert(SDWAOpcode != -1);		assert(SDWAOpcode != -1);

const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);		const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

// Create SDWA version of instruction MI and initialize its operands		// Create SDWA version of instruction MI and initialize its operands
MachineInstrBuilder SDWAInst =		MachineInstrBuilder SDWAInst =
▲ Show 20 Lines • Show All 60 Lines • ▼ Show 20 Lines	bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
if (Src1) {		if (Src1) {
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);		assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);		SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
}		}

// Apply all sdwa operand patterns		// Apply all sdwa operand patterns
bool Converted = false;		bool Converted = false;
for (auto &Operand : SDWAOperands) {		for (auto &Operand : SDWAOperands) {
		// There should be no intersection between SDWA operands and potential MIs
		// e.g.:
		// v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
		// v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
		// v_add_u32 v3, v4, v2
		//
		// In that example it is possible that we would fold 2nd instruction into 3rd
		// (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
		// already destroyed). So if SDWAOperand is also a potential MI then do not
		// apply it.
		if (PotentialMatches.count(Operand->getParentInst()) == 0)
Converted \|= Operand->convertToSDWA(*SDWAInst, TII);		Converted \|= Operand->convertToSDWA(*SDWAInst, TII);
}		}
if (!Converted) {		if (!Converted) {
SDWAInst->eraseFromParent();		SDWAInst->eraseFromParent();
return false;		return false;
}		}

DEBUG(dbgs() << "Convert instruction:" << MI		DEBUG(dbgs() << "Convert instruction:" << MI
<< "Into:" << *SDWAInst << '\n');		<< "Into:" << *SDWAInst << '\n');
Show All 9 Lines	bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
if (!ST.hasSDWA() \|\|		if (!ST.hasSDWA() \|\|
!AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9		!AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9
return false;		return false;
}		}

MRI = &MF.getRegInfo();		MRI = &MF.getRegInfo();
TRI = ST.getRegisterInfo();		TRI = ST.getRegisterInfo();
TII = ST.getInstrInfo();		TII = ST.getInstrInfo();

std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;		// Find all SDWA operands in MF.

matchSDWAOperands(MF);		matchSDWAOperands(MF);

for (auto &OperandPair : SDWAOperands) {		for (const auto &OperandPair : SDWAOperands) {
auto &Operand = OperandPair.second;		const auto &Operand = OperandPair.second;
MachineInstr *PotentialMI = Operand->potentialToConvert(TII);		MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
if (PotentialMI) {		if (PotentialMI && isConvertibleToSDWA(*PotentialMI)) {
PotentialMatches[PotentialMI].push_back(std::move(Operand));		PotentialMatches[PotentialMI].push_back(Operand.get());
}		}
}		}

for (auto &PotentialPair : PotentialMatches) {		for (auto &PotentialPair : PotentialMatches) {
MachineInstr &PotentialMI = *PotentialPair.first;		MachineInstr &PotentialMI = *PotentialPair.first;
convertToSDWA(PotentialMI, PotentialPair.second);		convertToSDWA(PotentialMI, PotentialPair.second);
}		}

		PotentialMatches.clear();
SDWAOperands.clear();		SDWAOperands.clear();
return false;		return false;
}		}

llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole.ll

Show First 20 Lines • Show All 387 Lines • ▼ Show 20 Lines	entry:
br label %add_label		br label %add_label
add_label:		add_label:
%add = add <2 x i16> %a, %b		%add = add <2 x i16> %a, %b
br label %store_label		br label %store_label
store_label:		store_label:
store <2 x i16> %add, <2 x i16> addrspace(1)* %out, align 4		store <2 x i16> %add, <2 x i16> addrspace(1)* %out, align 4
ret void		ret void
}		}


		; Check that "pulling out" SDWA operands works correctly.
		; GCN-LABEL: {{^}}pulled_out_test:
		; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
		; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
		; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
		; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
		; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
		; NOSDWA-NOT: v_and_b32_sdwa
		; NOSDWA-NOT: v_or_b32_sdwa

		; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
		; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
		; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
		; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
		; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD

		define amdgpu_kernel void @pulled_out_test(<8 x i8> addrspace(1)* %sourceA, <8 x i8> addrspace(1)* %destValues) {
		entry:
		%idxprom = ashr exact i64 15, 32
		%arrayidx = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %sourceA, i64 %idxprom
		%tmp = load <8 x i8>, <8 x i8> addrspace(1)* %arrayidx, align 8

		%tmp1 = extractelement <8 x i8> %tmp, i32 0
		%tmp2 = extractelement <8 x i8> %tmp, i32 1
		%tmp3 = extractelement <8 x i8> %tmp, i32 2
		%tmp4 = extractelement <8 x i8> %tmp, i32 3
		%tmp5 = extractelement <8 x i8> %tmp, i32 4
		%tmp6 = extractelement <8 x i8> %tmp, i32 5
		%tmp7 = extractelement <8 x i8> %tmp, i32 6
		%tmp8 = extractelement <8 x i8> %tmp, i32 7

		%tmp9 = insertelement <2 x i8> undef, i8 %tmp1, i32 0
		%tmp10 = insertelement <2 x i8> %tmp9, i8 %tmp2, i32 1
		%tmp11 = insertelement <2 x i8> undef, i8 %tmp3, i32 0
		%tmp12 = insertelement <2 x i8> %tmp11, i8 %tmp4, i32 1
		%tmp13 = insertelement <2 x i8> undef, i8 %tmp5, i32 0
		%tmp14 = insertelement <2 x i8> %tmp13, i8 %tmp6, i32 1
		%tmp15 = insertelement <2 x i8> undef, i8 %tmp7, i32 0
		%tmp16 = insertelement <2 x i8> %tmp15, i8 %tmp8, i32 1

		%tmp17 = shufflevector <2 x i8> %tmp10, <2 x i8> %tmp12, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
		%tmp18 = shufflevector <2 x i8> %tmp14, <2 x i8> %tmp16, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
		%tmp19 = shufflevector <4 x i8> %tmp17, <4 x i8> %tmp18, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

		%arrayidx5 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %destValues, i64 %idxprom
		store <8 x i8> %tmp19, <8 x i8> addrspace(1)* %arrayidx5, align 8
		ret void
		}

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] SDWA operands should not intersect with potential MIs
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 99425

llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] SDWA operands should not intersect with potential MIs
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 99425

llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole.ll

[AMDGPU] SDWA operands should not intersect with potential MIs
ClosedPublic