Diff 224876

llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp

Show First 20 Lines • Show All 347 Lines • ▼ Show 20 Lines

bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {		bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);		assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);		LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);

auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);		auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
assert(DstOpnd && DstOpnd->isReg());		assert(DstOpnd && DstOpnd->isReg());
auto DPPMovReg = DstOpnd->getReg();		auto DPPMovReg = DstOpnd->getReg();
auto DPPMovSubReg = DstOpnd->getSubReg();
if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {		if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"		LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
" for all uses\n");		" for all uses\n");
return false;		return false;
}		}

auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);		auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
assert(RowMaskOpnd && RowMaskOpnd->isImm());		assert(RowMaskOpnd && RowMaskOpnd->isImm());
▲ Show 20 Lines • Show All 190 Lines • ▼ Show 20 Lines	bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {

bool Changed = false;		bool Changed = false;
for (auto &MBB : MF) {		for (auto &MBB : MF) {
for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {		for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {
auto &MI = *I++;		auto &MI = *I++;
if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {		if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
Changed = true;		Changed = true;
++NumDPPMovsCombined;		++NumDPPMovsCombined;
		} else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) {
		auto Split = TII->expandMovDPP64(MI);
		for (auto M : { Split.first, Split.second }) {
		if (combineDPPMov(*M))
		++NumDPPMovsCombined;
		}
		Changed = true;
}		}
}		}
}		}
return Changed;		return Changed;
}		}
		vpykhtinUnsubmitted Done Reply Inline Actions Changed should depend on the Split done. vpykhtin: Changed should depend on the Split done.

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Show First 20 Lines • Show All 223 Lines • ▼ Show 20 Lines	public:

void loadRegFromStackSlot(MachineBasicBlock &MBB,		void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, unsigned DestReg,		MachineBasicBlock::iterator MI, unsigned DestReg,
int FrameIndex, const TargetRegisterClass *RC,		int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;		const TargetRegisterInfo *TRI) const override;

bool expandPostRAPseudo(MachineInstr &MI) const override;		bool expandPostRAPseudo(MachineInstr &MI) const override;

		// Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp
		vpykhtinUnsubmitted Done Reply Inline Actions please add short desc vpykhtin: please add short desc
		// instructions. Returns a pair of generated instructions.
		// Can split either post-RA with physical registers or pre-RA with
		// virtual registers. In the latter case the IR needs to be in SSA form
		// and a REG_SEQUENCE is produced to define the original register.
		std::pair<MachineInstr *, MachineInstr *>
		expandMovDPP64(MachineInstr &MI) const;

// Returns an opcode that can be used to move a value to a \p DstRC		// Returns an opcode that can be used to move a value to a \p DstRC
// register. If there is no hardware instruction that can store to \p		// register. If there is no hardware instruction that can store to \p
// DstRC, then AMDGPU::COPY is returned.		// DstRC, then AMDGPU::COPY is returned.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;		unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;

LLVM_READONLY		LLVM_READONLY
int commuteOpcode(unsigned Opc) const;		int commuteOpcode(unsigned Opc) const;

▲ Show 20 Lines • Show All 904 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Show First 20 Lines • Show All 1,448 Lines • ▼ Show 20 Lines	if (SrcOp.isImm()) {
.addReg(Dst, RegState::Implicit \| RegState::Define);		.addReg(Dst, RegState::Implicit \| RegState::Define);
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)		BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
.addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))		.addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
.addReg(Dst, RegState::Implicit \| RegState::Define);		.addReg(Dst, RegState::Implicit \| RegState::Define);
}		}
MI.eraseFromParent();		MI.eraseFromParent();
break;		break;
}		}
		case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
		expandMovDPP64(MI);
		break;
		}
case AMDGPU::V_SET_INACTIVE_B32: {		case AMDGPU::V_SET_INACTIVE_B32: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;		unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;		unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
BuildMI(MBB, MI, DL, get(NotOpc), Exec)		BuildMI(MBB, MI, DL, get(NotOpc), Exec)
.addReg(Exec);		.addReg(Exec);
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())		BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
.add(MI.getOperand(2));		.add(MI.getOperand(2));
BuildMI(MBB, MI, DL, get(NotOpc), Exec)		BuildMI(MBB, MI, DL, get(NotOpc), Exec)
▲ Show 20 Lines • Show All 97 Lines • ▼ Show 20 Lines	case TargetOpcode::BUNDLE: {

MI.eraseFromParent();		MI.eraseFromParent();
break;		break;
}		}
}		}
return true;		return true;
}		}

		// Splits a V_MOV_B64_DPP_PSEUDO into two V_MOV_B32_dpp instructions, one
		// per 32-bit half (sub0 first, then sub1), inserted before \p MI. \p MI is
		// erased afterwards.
		//
		// - Physical destination: each half writes the matching sub-register of
		//   the original destination.
		// - Virtual destination: each half defines a fresh VGPR_32 and a
		//   REG_SEQUENCE recombines both halves into the original register. The
		//   function must be in SSA form in this case (asserted).
		//
		// Returns pointers to the two generated moves (low half, high half) so the
		// caller can keep processing them; they are returned by pointer because
		// the instructions live in the basic block.
		std::pair<MachineInstr *, MachineInstr *>
		SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
		  assert(MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);

		  MachineBasicBlock &MBB = *MI.getParent();
		  DebugLoc DL = MBB.findDebugLoc(MI);
		  MachineFunction *MF = MBB.getParent();
		  MachineRegisterInfo &MRI = MF->getRegInfo();
		  Register Dst = MI.getOperand(0).getReg();
		  unsigned Part = 0;
		  MachineInstr *Split[2];

		  for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
		    auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
		    if (Dst.isPhysical()) {
		      MovDPP.addDef(RI.getSubReg(Dst, Sub));
		    } else {
		      assert(MRI.isSSA());
		      auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
		      MovDPP.addDef(Tmp);
		    }

		    for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
		      const MachineOperand &SrcOp = MI.getOperand(I);
		      assert(!SrcOp.isFPImm());
		      if (SrcOp.isImm()) {
		        // Select the 32-bit slice of the 64-bit immediate for this half.
		        APInt Imm(64, SrcOp.getImm());
		        Imm.ashrInPlace(Part * 32);
		        MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
		      } else {
		        assert(SrcOp.isReg());
		        Register Src = SrcOp.getReg();
		        if (Src.isPhysical())
		          MovDPP.addReg(RI.getSubReg(Src, Sub));
		        else
		          MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
		      }
		    }

		    // Remaining explicit operands are immediates and are copied unchanged
		    // to both halves.
		    for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I)
		      MovDPP.addImm(MI.getOperand(I).getImm());

		    Split[Part] = MovDPP;
		    ++Part;
		  }

		  if (Dst.isVirtual())
		    BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
		      .addReg(Split[0]->getOperand(0).getReg())
		      .addImm(AMDGPU::sub0)
		      .addReg(Split[1]->getOperand(0).getReg())
		      .addImm(AMDGPU::sub1);

		  MI.eraseFromParent();
		  return std::make_pair(Split[0], Split[1]);
		}

bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,		bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
MachineOperand &Src0,		MachineOperand &Src0,
unsigned Src0OpName,		unsigned Src0OpName,
MachineOperand &Src1,		MachineOperand &Src1,
unsigned Src1OpName) const {		unsigned Src1OpName) const {
MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);		MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
if (!Src0Mods)		if (!Src0Mods)
return false;		return false;
▲ Show 20 Lines • Show All 4,884 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIInstructions.td

Show First 20 Lines • Show All 86 Lines • ▼ Show 20 Lines
def ATOMIC_FENCE : SPseudoInstSI<		def ATOMIC_FENCE : SPseudoInstSI<
(outs), (ins i32imm:$ordering, i32imm:$scope),		(outs), (ins i32imm:$ordering, i32imm:$scope),
[(atomic_fence (i32 imm:$ordering), (i32 imm:$scope))],		[(atomic_fence (i32 imm:$ordering), (i32 imm:$scope))],
"ATOMIC_FENCE $ordering, $scope"> {		"ATOMIC_FENCE $ordering, $scope"> {
let hasSideEffects = 1;		let hasSideEffects = 1;
let maybeAtomic = 1;		let maybeAtomic = 1;
}		}

		// VOP profile used by V_MOV_B64_DPP_PSEUDO. The type list is
		// [i64, i64, untyped, untyped] (presumably result, src0, src1, src2 --
		// confirm against VOPProfile). Sets HasExt and HasExtDPP.
		def VOP_I64_I64_DPP : VOPProfile <[i64, i64, untyped, untyped]> {
		let HasExt = 1;
		let HasExtDPP = 1;
		}

let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {		let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {

// For use in patterns		// For use in patterns
def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),		def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> {		(ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> {
let isPseudo = 1;		let isPseudo = 1;
let isCodeGenOnly = 1;		let isCodeGenOnly = 1;
let usesCustomInserter = 1;		let usesCustomInserter = 1;
}		}

// 64-bit vector move instruction. This is mainly used by the		// 64-bit vector move instruction. This is mainly used by the
// SIFoldOperands pass to enable folding of inline immediates.		// SIFoldOperands pass to enable folding of inline immediates.
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),		def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)>;		(ins VSrc_b64:$src0)>;

		// 64-bit vector move with dpp. Expanded post-RA.
		// The expansion is done by SIInstrInfo::expandMovDPP64, which replaces the
		// pseudo with two V_MOV_B32_dpp instructions (sub0 and sub1 halves).
		// GCNDPPCombine::runOnMachineFunction also calls that expansion so the
		// halves can be DPP-combined.
		def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64_DPP> {
		let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
		}

// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the		// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
// WQM pass processes it.		// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;		def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is		// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is
// turned into a copy by WQM pass, but does not seed WQM requirements.		// turned into a copy by WQM pass, but does not seed WQM requirements.
def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;		def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

▲ Show 20 Lines • Show All 1,732 Lines • ▼ Show 20 Lines	def : GCNPat <
(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)		(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;		>;

def : GCNPat <		def : GCNPat <
(v4f16 (scalar_to_vector f16:$src0)),		(v4f16 (scalar_to_vector f16:$src0)),
(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)		(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;		>;

		// Select the i64 form of int_amdgcn_mov_dpp to V_MOV_B64_DPP_PSEUDO.
		// The intrinsic carries a single value operand, so $src is passed for
		// both of the pseudo's first two operands; the dpp_ctrl, row_mask,
		// bank_mask and bound_ctrl immediates are forwarded.
		def : GCNPat <
		(i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask,
		timm:$bound_ctrl)),
		(V_MOV_B64_DPP_PSEUDO $src, $src, (as_i32imm $dpp_ctrl),
		(as_i32imm $row_mask), (as_i32imm $bank_mask),
		(as_i1imm $bound_ctrl))
		>;

		arsenmUnsubmitted Done Reply Inline Actions Why not do the split here? Why treat it as a post-RA pseudo? At latest I would have expected this to be expanded in FinalizeISel arsenm: Why not do the split here? Why treat it as a post-RA pseudo? At latest I would have expected…
		arsenmUnsubmitted Not Done Reply Inline Actions I still don't see the point of the pseudo. You can just emit the reg_sequence directly here? arsenm: I still don't see the point of the pseudo. You can just emit the reg_sequence directly here?
		// Select the i64 form of int_amdgcn_update_dpp to V_MOV_B64_DPP_PSEUDO.
		// $old and $src become the pseudo's first two operands; the dpp_ctrl,
		// row_mask, bank_mask and bound_ctrl immediates are forwarded.
		def : GCNPat <
		(i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask,
		timm:$bank_mask, timm:$bound_ctrl)),
		(V_MOV_B64_DPP_PSEUDO $old, $src, (as_i32imm $dpp_ctrl),
		(as_i32imm $row_mask), (as_i32imm $bank_mask),
		(as_i1imm $bound_ctrl))
		>;

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Fract Patterns		// Fract Patterns
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

let SubtargetPredicate = isGFX6 in {		let SubtargetPredicate = isGFX6 in {

// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is		// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient		// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
▲ Show 20 Lines • Show All 129 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/dpp_combine.mir

Show First 20 Lines • Show All 712 Lines • ▼ Show 20 Lines	bb.0:
%8:vgpr_32 = V_MOV_B32_e32 5, implicit $exec		%8:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec		%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec		%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1		%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
%5:vreg_64 = REG_SEQUENCE %4.sub0, %subreg.sub0, %4.sub1, %subreg.sub1		%5:vreg_64 = REG_SEQUENCE %4.sub0, %subreg.sub0, %4.sub1, %subreg.sub1
%6:vgpr_32 = V_ADD_I32_e32 %5.sub0, %8, implicit-def $vcc, implicit $exec		%6:vgpr_32 = V_ADD_I32_e32 %5.sub0, %8, implicit-def $vcc, implicit $exec
%7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %8, implicit-def $vcc, implicit $vcc, implicit $exec		%7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %8, implicit-def $vcc, implicit $vcc, implicit $exec
...		...

		# GCN-LABEL: name: dpp64_add64_impdef
		# GCN: %3:vgpr_32 = V_ADD_I32_dpp %1.sub0, %0.sub0, undef %4:vgpr_32, 1, 15, 15, 1, implicit-def $vcc, implicit $exec
		# GCN: %5:vgpr_32 = V_ADDC_U32_dpp %1.sub1, %0.sub1, undef %4:vgpr_32, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
		name: dpp64_add64_impdef
		tracksRegLiveness: true
		body: \|
		bb.0:
		%0:vreg_64 = IMPLICIT_DEF
		%1:vreg_64 = IMPLICIT_DEF
		%2:vreg_64 = V_MOV_B64_DPP_PSEUDO %1:vreg_64, %0:vreg_64, 1, 15, 15, 1, implicit $exec
		%5:vgpr_32 = V_ADD_I32_e32 %2.sub0, undef %4:vgpr_32, implicit-def $vcc, implicit $exec
		%6:vgpr_32 = V_ADDC_U32_e32 %2.sub1, undef %4, implicit-def $vcc, implicit $vcc, implicit $exec
		...

		# GCN-LABEL: name: dpp64_add64_undef
		# GCN: %3:vgpr_32 = V_ADD_I32_dpp undef %1.sub0:vreg_64, undef %2.sub0:vreg_64, undef %4:vgpr_32, 1, 15, 15, 1, implicit-def $vcc, implicit $exec
		# GCN: %5:vgpr_32 = V_ADDC_U32_dpp undef %1.sub1:vreg_64, undef %2.sub1:vreg_64, undef %4:vgpr_32, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
		name: dpp64_add64_undef
		tracksRegLiveness: true
		body: \|
		bb.0:
		%2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64, undef %0:vreg_64, 1, 15, 15, 1, implicit $exec
		%5:vgpr_32 = V_ADD_I32_e32 %2.sub0, undef %4:vgpr_32, implicit-def $vcc, implicit $exec
		%6:vgpr_32 = V_ADDC_U32_e32 %2.sub1, undef %4, implicit-def $vcc, implicit $vcc, implicit $exec
		...

		# GCN-LABEL: name: dpp64_add64_first_combined
		# GCN: %8:vgpr_32 = V_MOV_B32_dpp undef %1.sub1:vreg_64, undef %2.sub1:vreg_64, 1, 15, 15, 1, implicit $exec
		# GCN: %0:vreg_64 = REG_SEQUENCE undef %7:vgpr_32, %subreg.sub0, %8, %subreg.sub1
		# GCN: %3:vgpr_32 = V_ADD_I32_dpp undef %1.sub0:vreg_64, undef %2.sub0:vreg_64, undef %4:vgpr_32, 1, 15, 15, 1, implicit-def $vcc, implicit $exec
		# GCN: %5:vgpr_32, dead %6:sreg_64_xexec = V_ADDC_U32_e64 1, %0.sub1, undef $vcc, 0, implicit $exec
		name: dpp64_add64_first_combined
		tracksRegLiveness: true
		body: \|
		bb.0:
		%2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64, undef %0:vreg_64, 1, 15, 15, 1, implicit $exec
		%4:vgpr_32 = V_ADD_I32_e32 %2.sub0, undef %3:vgpr_32, implicit-def $vcc, implicit $exec
		%5:vgpr_32, dead %6:sreg_64_xexec = V_ADDC_U32_e64 1, %2.sub1, undef $vcc, 0, implicit $exec
		...

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll

Show First 20 Lines • Show All 70 Lines • ▼ Show 20 Lines	endif:
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %val_i32, i32 1, i32 1, i32 1, i1 1) #0		%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %val_i32, i32 1, i32 1, i32 1, i1 1) #0
%tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0		%tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
%tmp2 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp1, i32 1, i32 1, i32 1, i1 1) #0		%tmp2 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp1, i32 1, i32 1, i32 1, i1 1) #0
%tmp_float = bitcast i32 %tmp2 to float		%tmp_float = bitcast i32 %tmp2 to float
store float %tmp_float, float addrspace(1)* %out		store float %tmp_float, float addrspace(1)* %out
ret void		ret void
}		}

		; VI-LABEL: {{^}}mov_dpp64_test:
		; VI: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
		; VI: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
		define amdgpu_kernel void @mov_dpp64_test(i64 addrspace(1)* %out, i64 %in1) {
		%tmp0 = call i64 @llvm.amdgcn.mov.dpp.i64(i64 %in1, i32 1, i32 1, i32 1, i1 0) #0
		store i64 %tmp0, i64 addrspace(1)* %out
		ret void
		}

		; An i64 immediate passed to the 64-bit mov.dpp intrinsic must be emitted as
		; two 32-bit DPP moves. The unoptimized run only counts the two moves.
		; VI-LABEL: {{^}}mov_dpp64_imm_test:
		; VI-OPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9
		; VI-OPT-DAG: s_movk_i32 s[[SOLD_HI:[0-9]+]], 0x7047
		; VI-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], s[[SOLD_LO]]
		; VI-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], s[[SOLD_HI]]
		; VI-OPT-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[OLD_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
		; VI-OPT-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[OLD_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
		; VI-NOOPT-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
		define amdgpu_kernel void @mov_dpp64_imm_test(i64 addrspace(1)* %out) {
		%tmp0 = call i64 @llvm.amdgcn.mov.dpp.i64(i64 123451234512345, i32 1, i32 1, i32 1, i1 0) #0
		store i64 %tmp0, i64 addrspace(1)* %out
		ret void
		}

declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #0		declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #0
		declare i64 @llvm.amdgcn.mov.dpp.i64(i64, i32, i32, i32, i1) #0

attributes #0 = { nounwind readnone convergent }		attributes #0 = { nounwind readnone convergent }

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll

	; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s \| FileCheck --check-prefixes=GCN,GFX8 %s			; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s \| FileCheck --check-prefixes=GCN,GFX8,GFX8-OPT,GCN-OPT %s
	; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s \| FileCheck --check-prefixes=GCN,GFX10 %s			; RUN: llc -march=amdgcn -mcpu=tonga -O0 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s \| FileCheck --check-prefixes=GCN,GFX8,GFX8-NOOPT %s
				; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s \| FileCheck --check-prefixes=GCN,GFX10,GCN-OPT %s

	; GCN-LABEL: {{^}}dpp_test:			; GCN-LABEL: {{^}}dpp_test:
	; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}			; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
	; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}			; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
	; GFX8: s_nop 1			; GFX8-OPT: s_nop 1
				; GFX8-NOOPT: s_nop 0
				; GFX8-NOOPT-NEXT: s_nop 0
	; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}			; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
	define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {			define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
	%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 0) #0			%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 0) #0
	store i32 %tmp0, i32 addrspace(1)* %out			store i32 %tmp0, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}dpp_test_bc:			; GCN-LABEL: {{^}}dpp_test_bc:
	; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}			; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
	; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}			; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
	; GFX8: s_nop 1			; GFX8-OPT: s_nop 1
				; GFX8-NOOPT: s_nop 0
				; GFX8-NOOPT-NEXT: s_nop 0
	; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0{{$}}			; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0{{$}}
	define amdgpu_kernel void @dpp_test_bc(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {			define amdgpu_kernel void @dpp_test_bc(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
	%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 1) #0			%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 1) #0
	store i32 %tmp0, i32 addrspace(1)* %out			store i32 %tmp0, i32 addrspace(1)* %out
	ret void			ret void
	}			}


	; VI-LABEL: {{^}}dpp_test1:			; GCN-LABEL: {{^}}dpp_test1:
	; GFX10: v_add_nc_u32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}			; GFX10: v_add_nc_u32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
	; GFX8-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}			; GFX8-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
	; GFX8-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}			; GFX8-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}
	; GFX8-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0			; GFX8-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
	; GFX8: s_nop 0			; GFX8: s_nop 0
	; GFX8-NEXT: s_nop 0			; GFX8-NEXT: s_nop 0
	; GFX8-OPT-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf			; GFX8-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
	@0 = internal unnamed_addr addrspace(3) global [448 x i32] undef, align 4			@0 = internal unnamed_addr addrspace(3) global [448 x i32] undef, align 4
	define weak_odr amdgpu_kernel void @dpp_test1(i32* %arg) local_unnamed_addr {			define weak_odr amdgpu_kernel void @dpp_test1(i32* %arg) local_unnamed_addr {
	bb:			bb:
	%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()			%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
	%tmp1 = zext i32 %tmp to i64			%tmp1 = zext i32 %tmp to i64
	%tmp2 = getelementptr inbounds [448 x i32], [448 x i32] addrspace(3)* @0, i32 0, i32 %tmp			%tmp2 = getelementptr inbounds [448 x i32], [448 x i32] addrspace(3)* @0, i32 0, i32 %tmp
	%tmp3 = load i32, i32 addrspace(3)* %tmp2, align 4			%tmp3 = load i32, i32 addrspace(3)* %tmp2, align 4
	fence syncscope("workgroup-one-as") release			fence syncscope("workgroup-one-as") release
	tail call void @llvm.amdgcn.s.barrier()			tail call void @llvm.amdgcn.s.barrier()
	fence syncscope("workgroup-one-as") acquire			fence syncscope("workgroup-one-as") acquire
	%tmp4 = add nsw i32 %tmp3, %tmp3			%tmp4 = add nsw i32 %tmp3, %tmp3
	%tmp5 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp4, i32 177, i32 15, i32 15, i1 zeroext false)			%tmp5 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp4, i32 177, i32 15, i32 15, i1 zeroext false)
	%tmp6 = add nsw i32 %tmp5, %tmp4			%tmp6 = add nsw i32 %tmp5, %tmp4
	%tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp1			%tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
	store i32 %tmp6, i32* %tmp7, align 4			store i32 %tmp6, i32* %tmp7, align 4
	ret void			ret void
	}			}

				; GCN-LABEL: {{^}}update_dpp64_test:
				; GCN: load_dwordx2 v{{\[}}[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
				; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
				; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
				define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i64 %in2) {
				%id = tail call i32 @llvm.amdgcn.workitem.id.x()
				%gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
				%load = load i64, i64 addrspace(1)* %gep
				%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 1, i32 1, i1 0) #0
				store i64 %tmp0, i64 addrspace(1)* %gep
				ret void
				}

				; GCN-LABEL: {{^}}update_dpp64_imm_old_test:
				; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9
				; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
				; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9
				; GFX8-NOOPT-DAG: s_movk_i32 s[[SOLD_HI:[0-9]+]], 0x7047
				; GCN-DAG: load_dwordx2 v{{\[}}[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
				; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
				; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
				; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
				; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
				define amdgpu_kernel void @update_dpp64_imm_old_test(i64 addrspace(1)* %arg, i64 %in2) {
				%id = tail call i32 @llvm.amdgcn.workitem.id.x()
				%gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
				%load = load i64, i64 addrspace(1)* %gep
				%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 123451234512345, i64 %load, i32 1, i32 1, i32 1, i1 0) #0
				store i64 %tmp0, i64 addrspace(1)* %gep
				ret void
				}

				; GCN-LABEL: {{^}}update_dpp64_imm_src_test:
				; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9
				; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
				; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9
				; GFX8-NOOPT-DAG: s_movk_i32 s[[SOLD_HI:[0-9]+]], 0x7047
				; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[OLD_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
				; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[OLD_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
				; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
				; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
				define amdgpu_kernel void @update_dpp64_imm_src_test(i64 addrspace(1)* %out, i64 %in1) {
				%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 123451234512345, i32 1, i32 1, i32 1, i1 0) #0
				store i64 %tmp0, i64 addrspace(1)* %out
				ret void
				}

	declare i32 @llvm.amdgcn.workitem.id.x()			declare i32 @llvm.amdgcn.workitem.id.x()
	declare void @llvm.amdgcn.s.barrier()			declare void @llvm.amdgcn.s.barrier()
	declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0			declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
				declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0

	attributes #0 = { nounwind readnone convergent }			attributes #0 = { nounwind readnone convergent }

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Support mov dpp with 64 bit operands
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 224876

llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp

llvm/lib/Target/AMDGPU/SIInstrInfo.h

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

llvm/lib/Target/AMDGPU/SIInstructions.td

llvm/test/CodeGen/AMDGPU/dpp_combine.mir

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Support mov dpp with 64 bit operands
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 224876

llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp

llvm/lib/Target/AMDGPU/SIInstrInfo.h

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

llvm/lib/Target/AMDGPU/SIInstructions.td

llvm/test/CodeGen/AMDGPU/dpp_combine.mir

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll

[AMDGPU] Support mov dpp with 64 bit operands
ClosedPublic