Diff 223962

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Show First 20 Lines • Show All 964 Lines • ▼ Show 20 Lines	public:
bool hasDPPWavefrontShifts() const {		bool hasDPPWavefrontShifts() const {
return HasDPP && getGeneration() < GFX10;		return HasDPP && getGeneration() < GFX10;
}		}

bool hasDPP8() const {		bool hasDPP8() const {
return HasDPP8;		return HasDPP8;
}		}

		/// \returns true when the subtarget has both the HasDPP and GFX10Insts
		/// features set.
		bool hasDPP16() const {
		return HasDPP && GFX10Insts;
		}

bool hasR128A16() const {		bool hasR128A16() const {
return HasR128A16;		return HasR128A16;
}		}

bool hasOffset3fBug() const {		bool hasOffset3fBug() const {
return HasOffset3fBug;		return HasOffset3fBug;
}		}

▲ Show 20 Lines • Show All 371 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Show First 20 Lines • Show All 1,448 Lines • ▼ Show 20 Lines	if (SrcOp.isImm()) {
.addReg(Dst, RegState::Implicit \| RegState::Define);		.addReg(Dst, RegState::Implicit \| RegState::Define);
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)		BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
.addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))		.addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
.addReg(Dst, RegState::Implicit \| RegState::Define);		.addReg(Dst, RegState::Implicit \| RegState::Define);
}		}
MI.eraseFromParent();		MI.eraseFromParent();
break;		break;
}		}
		// Expand the 64-bit DPP move pseudo into two 32-bit DPP moves, one for
		// each 32-bit half (sub0, then sub1) of the destination register.
		case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
		Register Dst = MI.getOperand(0).getReg();
		// Subtargets reporting hasDPP16() use the gfx10 variant of the opcode.
		unsigned Opc = ST.hasDPP16() ? AMDGPU::V_MOV_B32_dpp_gfx10
		: AMDGPU::V_MOV_B32_dpp;
		// Index of the half being emitted: 0 for sub0, 1 for sub1.
		unsigned Part = 0;

		for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
		Register DstSub = RI.getSubReg(Dst, Sub);
		auto MovDPP = BuildMI(MBB, MI, DL, get(Opc), DstSub);

		for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
		const MachineOperand &SrcOp = MI.getOperand(I);
		assert(!SrcOp.isFPImm());
		if (SrcOp.isImm()) {
		// Shift the 64-bit immediate right by 32 * Part and keep the low
		// 32 bits, i.e. the piece that belongs to this half.
		APInt Imm(64, SrcOp.getImm());
		Imm.ashrInPlace(Part * 32);
		MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
		} else {
		// Register operand: use the sub-register matching this half.
		assert(SrcOp.isReg());
		MovDPP.addReg(RI.getSubReg(SrcOp.getReg(), Sub));
		}
		}

		// Copy the remaining explicit operands (index 3 onwards) unchanged to
		// both halves; each is read as an immediate.
		for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I)
		MovDPP.addImm(MI.getOperand(I).getImm());

		// The gfx10 opcode takes one more immediate operand; it is always 0 here.
		if (ST.hasDPP16())
		MovDPP.addImm(0); // FI

		// Each 32-bit move also carries an implicit def of the full 64-bit Dst.
		MovDPP.addReg(Dst, RegState::Implicit \| RegState::Define);
		++Part;
		}
		// The pseudo is fully replaced by the two moves built above.
		MI.eraseFromParent();
		break;
		}
case AMDGPU::V_SET_INACTIVE_B32: {		case AMDGPU::V_SET_INACTIVE_B32: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;		unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;		unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
BuildMI(MBB, MI, DL, get(NotOpc), Exec)		BuildMI(MBB, MI, DL, get(NotOpc), Exec)
.addReg(Exec);		.addReg(Exec);
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())		BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
.add(MI.getOperand(2));		.add(MI.getOperand(2));
BuildMI(MBB, MI, DL, get(NotOpc), Exec)		BuildMI(MBB, MI, DL, get(NotOpc), Exec)
▲ Show 20 Lines • Show All 4,997 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIInstructions.td

Show First 20 Lines • Show All 86 Lines • ▼ Show 20 Lines
def ATOMIC_FENCE : SPseudoInstSI<		def ATOMIC_FENCE : SPseudoInstSI<
(outs), (ins i32imm:$ordering, i32imm:$scope),		(outs), (ins i32imm:$ordering, i32imm:$scope),
[(atomic_fence (i32 imm:$ordering), (i32 imm:$scope))],		[(atomic_fence (i32 imm:$ordering), (i32 imm:$scope))],
"ATOMIC_FENCE $ordering, $scope"> {		"ATOMIC_FENCE $ordering, $scope"> {
let hasSideEffects = 1;		let hasSideEffects = 1;
let maybeAtomic = 1;		let maybeAtomic = 1;
}		}

		// VOP profile with the type list [i64, i64, untyped, untyped] and the
		// HasExt / HasExtDPP flags enabled. Used by V_MOV_B64_DPP_PSEUDO.
		def VOP_I64_I64_DPP : VOPProfile <[i64, i64, untyped, untyped]> {
		let HasExt = 1;
		let HasExtDPP = 1;
		}

let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {		let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {

// For use in patterns		// For use in patterns
def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),		def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> {		(ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> {
let isPseudo = 1;		let isPseudo = 1;
let isCodeGenOnly = 1;		let isCodeGenOnly = 1;
let usesCustomInserter = 1;		let usesCustomInserter = 1;
}		}

// 64-bit vector move instruction. This is mainly used by the		// 64-bit vector move instruction. This is mainly used by the
// SIFoldOperands pass to enable folding of inline immediates.		// SIFoldOperands pass to enable folding of inline immediates.
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),		def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)>;		(ins VSrc_b64:$src0)>;

		// 64-bit vector move with dpp. Expanded post-RA.
		// Built on the VOP_I64_I64_DPP profile; the i64 mov_dpp / update_dpp
		// intrinsic patterns select to this pseudo.
		def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64_DPP> {
		let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
		}

// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the		// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
// WQM pass processes it.		// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;		def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is		// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is
// turned into a copy by WQM pass, but does not seed WQM requirements.		// turned into a copy by WQM pass, but does not seed WQM requirements.
def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;		def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

▲ Show 20 Lines • Show All 1,732 Lines • ▼ Show 20 Lines	def : GCNPat <
(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)		(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;		>;

def : GCNPat <		def : GCNPat <
(v4f16 (scalar_to_vector f16:$src0)),		(v4f16 (scalar_to_vector f16:$src0)),
(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)		(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;		>;

		// Select the i64 form of int_amdgcn_mov_dpp to V_MOV_B64_DPP_PSEUDO.
		// The intrinsic has a single value operand, so $src is passed for both
		// of the pseudo's first two inputs. The four timm control operands are
		// converted with as_i32imm / as_i1imm.
		def : GCNPat <
		(i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask,
		timm:$bound_ctrl)),
		(V_MOV_B64_DPP_PSEUDO $src, $src, (as_i32imm $dpp_ctrl),
		(as_i32imm $row_mask), (as_i32imm $bank_mask),
		(as_i1imm $bound_ctrl))
		>;

		Inline comment from arsenm (marked Done): Why not do the split here? Why treat it as a post-RA pseudo? At the latest, I would have expected this to be expanded in FinalizeISel.
		Inline comment from arsenm (marked Not Done): I still don't see the point of the pseudo. You can just emit the reg_sequence directly here?
		// Select the i64 form of int_amdgcn_update_dpp to V_MOV_B64_DPP_PSEUDO,
		// forwarding $old and $src as the pseudo's first two inputs. The four
		// timm control operands are converted with as_i32imm / as_i1imm.
		def : GCNPat <
		(i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask,
		timm:$bank_mask, timm:$bound_ctrl)),
		(V_MOV_B64_DPP_PSEUDO $old, $src, (as_i32imm $dpp_ctrl),
		(as_i32imm $row_mask), (as_i32imm $bank_mask),
		(as_i1imm $bound_ctrl))
		>;

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Fract Patterns		// Fract Patterns
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

let SubtargetPredicate = isGFX6 in {		let SubtargetPredicate = isGFX6 in {

// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is		// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient		// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
▲ Show 20 Lines • Show All 129 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll

Show First 20 Lines • Show All 70 Lines • ▼ Show 20 Lines	endif:
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %val_i32, i32 1, i32 1, i32 1, i1 1) #0		%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %val_i32, i32 1, i32 1, i32 1, i1 1) #0
%tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0		%tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
%tmp2 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp1, i32 1, i32 1, i32 1, i1 1) #0		%tmp2 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp1, i32 1, i32 1, i32 1, i1 1) #0
%tmp_float = bitcast i32 %tmp2 to float		%tmp_float = bitcast i32 %tmp2 to float
store float %tmp_float, float addrspace(1)* %out		store float %tmp_float, float addrspace(1)* %out
ret void		ret void
}		}

		; A 64-bit mov.dpp of a kernel argument is expected to produce two
		; v_mov_b32_dpp instructions that carry the same DPP controls.
		; VI-LABEL: {{^}}mov_dpp64_test:
		; VI: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
		; VI: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
		define amdgpu_kernel void @mov_dpp64_test(i64 addrspace(1)* %out, i64 %in1) {
		%tmp0 = call i64 @llvm.amdgcn.mov.dpp.i64(i64 %in1, i32 1, i32 1, i32 1, i1 0) #0
		store i64 %tmp0, i64 addrspace(1)* %out
		ret void
		}

		; A 64-bit mov.dpp of the constant 123451234512345, whose high half is
		; 0x7047 and low half is 0x3afaedd9.
		; The OPT checks expect each half to be materialized in an SGPR, copied
		; to a VGPR, and that VGPR to be both destination and source of a
		; v_mov_b32_dpp. The NOOPT check only requires two v_mov_b32_dpp
		; instructions; it uses the count form of the directive (prefix, then
		; COUNT, then the number, separated by dashes).
		; VI-LABEL: {{^}}mov_dpp64_imm_test:
		; VI-OPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9
		; VI-OPT-DAG: s_movk_i32 s[[SOLD_HI:[0-9]+]], 0x7047
		; VI-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], s[[SOLD_LO]]
		; VI-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], s[[SOLD_HI]]
		; VI-OPT-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[OLD_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
		; VI-OPT-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[OLD_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
		; VI-NOOPT-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
		define amdgpu_kernel void @mov_dpp64_imm_test(i64 addrspace(1)* %out) {
		%tmp0 = call i64 @llvm.amdgcn.mov.dpp.i64(i64 123451234512345, i32 1, i32 1, i32 1, i1 0) #0
		store i64 %tmp0, i64 addrspace(1)* %out
		ret void
		}

declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #0		declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #0
		declare i64 @llvm.amdgcn.mov.dpp.i64(i64, i32, i32, i32, i1) #0

attributes #0 = { nounwind readnone convergent }		attributes #0 = { nounwind readnone convergent }

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll

Show First 20 Lines • Show All 44 Lines • ▼ Show 20 Lines	bb:
%tmp4 = add nsw i32 %tmp3, %tmp3		%tmp4 = add nsw i32 %tmp3, %tmp3
%tmp5 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp4, i32 177, i32 15, i32 15, i1 zeroext false)		%tmp5 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp4, i32 177, i32 15, i32 15, i1 zeroext false)
%tmp6 = add nsw i32 %tmp5, %tmp4		%tmp6 = add nsw i32 %tmp5, %tmp4
%tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp1		%tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
store i32 %tmp6, i32* %tmp7, align 4		store i32 %tmp6, i32* %tmp7, align 4
ret void		ret void
}		}

		; A 64-bit update.dpp whose src is loaded from memory. The checks
		; capture the two loaded registers and expect one v_mov_b32_dpp reading
		; each of them, in either order. The trailing {{$}} anchors each match
		; at end of line, so nothing may follow bank_mask.
		; GCN-LABEL: {{^}}update_dpp64_test:
		; GCN: load_dwordx2 v{{\[}}[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
		; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
		; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
		define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i64 %in2) {
		%id = tail call i32 @llvm.amdgcn.workitem.id.x()
		%gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
		%load = load i64, i64 addrspace(1)* %gep
		%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 1, i32 1, i1 0) #0
		store i64 %tmp0, i64 addrspace(1)* %gep
		ret void
		}

		; A 64-bit update.dpp whose old value is the constant 123451234512345
		; (high half 0x7047, low half 0x3afaedd9) and whose src is loaded. The
		; checks expect the two constant halves to be moved into VGPRs, and
		; each v_mov_b32_dpp to write one of those VGPRs while reading the
		; matching loaded half.
		; GCN-LABEL: {{^}}update_dpp64_imm_old_test:
		; GCN-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9
		; GCN-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
		; GCN-DAG: load_dwordx2 v{{\[}}[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
		; GCN-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
		; GCN-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
		define amdgpu_kernel void @update_dpp64_imm_old_test(i64 addrspace(1)* %arg, i64 %in2) {
		%id = tail call i32 @llvm.amdgcn.workitem.id.x()
		%gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
		%load = load i64, i64 addrspace(1)* %gep
		%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 123451234512345, i64 %load, i32 1, i32 1, i32 1, i1 0) #0
		store i64 %tmp0, i64 addrspace(1)* %gep
		ret void
		}

		; A 64-bit update.dpp whose src is the constant 123451234512345 (high
		; half 0x7047, low half 0x3afaedd9). The checks expect the two constant
		; halves to be moved into VGPRs and each v_mov_b32_dpp to read one of
		; them. Note that the OLD_LO / OLD_HI captures here name the registers
		; holding the src constant, not the old operand.
		; GCN-LABEL: {{^}}update_dpp64_imm_src_test:
		; GCN-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9
		; GCN-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
		; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[OLD_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
		; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[OLD_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
		define amdgpu_kernel void @update_dpp64_imm_src_test(i64 addrspace(1)* %out, i64 %in1) {
		%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 123451234512345, i32 1, i32 1, i32 1, i1 0) #0
		store i64 %tmp0, i64 addrspace(1)* %out
		ret void
		}

declare i32 @llvm.amdgcn.workitem.id.x()		declare i32 @llvm.amdgcn.workitem.id.x()
declare void @llvm.amdgcn.s.barrier()		declare void @llvm.amdgcn.s.barrier()
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0		declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
		declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0

attributes #0 = { nounwind readnone convergent }		attributes #0 = { nounwind readnone convergent }

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Support mov dpp with 64 bit operands
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 223962

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

llvm/lib/Target/AMDGPU/SIInstructions.td

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Support mov dpp with 64 bit operandsClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 223962

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

llvm/lib/Target/AMDGPU/SIInstructions.td

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll

[AMDGPU] Support mov dpp with 64 bit operands
ClosedPublic