Diff 474487

llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

Show First 20 Lines • Show All 826 Lines • ▼ Show 20 Lines	for (MachineInstr &MI : MBB) {
if (auto Operand = matchSDWAOperand(MI)) {		if (auto Operand = matchSDWAOperand(MI)) {
LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');		LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
SDWAOperands[&MI] = std::move(Operand);		SDWAOperands[&MI] = std::move(Operand);
++NumSDWAPatternsFound;		++NumSDWAPatternsFound;
}		}
}		}
}		}

// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and		// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
// V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows isConvertibleToSDWA		// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
// to perform its transformation on V_ADD_CO_U32_e32 into V_ADD_CO_U32_sdwa.		// V_ADD_CO_U32_sdwa.
//		//
// We are transforming from a VOP3 into a VOP2 form of the instruction.		// We are transforming from a VOP3 into a VOP2 form of the instruction.
// %19:vgpr_32 = V_AND_B32_e32 255,		// %19:vgpr_32 = V_AND_B32_e32 255,
// killed %16:vgpr_32, implicit $exec		// killed %16:vgpr_32, implicit $exec
// %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64		// %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
// %26.sub0:vreg_64, %19:vgpr_32, implicit $exec		// %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64		// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
// %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec		// %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//		//
// becomes		// becomes
// %47:vgpr_32 = V_ADD_CO_U32_sdwa		// %47:vgpr_32 = V_ADD_CO_U32_sdwa
// 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,		// 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
// implicit-def $vcc, implicit $exec		// implicit-def $vcc, implicit $exec
// %48:vgpr_32 = V_ADDC_U32_e32		// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
// 0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec		// %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,		void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
		arsenmUnsubmitted Not Done Reply Inline Actions Only converting the low half to VOP2 is somewhat surprising given the name, but I guess this makes sense arsenm: Only converting the low half to VOP2 is somewhat surprising given the name, but I guess this…
const GCNSubtarget &ST) const {		const GCNSubtarget &ST) const {
int Opc = MI.getOpcode();		int Opc = MI.getOpcode();
assert((Opc == AMDGPU::V_ADD_CO_U32_e64 \|\| Opc == AMDGPU::V_SUB_CO_U32_e64) &&		assert((Opc == AMDGPU::V_ADD_CO_U32_e64 \|\| Opc == AMDGPU::V_SUB_CO_U32_e64) &&
"Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");		"Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

// Can the candidate MI be shrunk?		// Can the candidate MI be shrunk?
if (!TII->canShrink(MI, *MRI))		if (!TII->canShrink(MI, *MRI))
return;		return;
Opc = AMDGPU::getVOPe32(Opc);		Opc = AMDGPU::getVOPe32(Opc);
// Find the related ADD instruction.		// Find the related ADD instruction.
const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);		const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
if (!Sdst)		if (!Sdst)
return;		return;
MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);		MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
if (!NextOp)		if (!NextOp)
return;		return;
MachineInstr &MISucc = *NextOp->getParent();		MachineInstr &MISucc = *NextOp->getParent();
// Can the successor be shrunk?
		rampitecUnsubmitted Not Done Reply Inline Actions This can be inline literal and still useful I suppose. rampitec: This can be inline literal and still useful I suppose.
		yassinghAuthorUnsubmitted Done Reply Inline Actions Sorry I don't understand, are you suggesting moving this condition to a string literal? yassingh: Sorry I don't understand, are you suggesting moving this condition to a string literal?
		rampitecUnsubmitted Not Done Reply Inline Actions I mean bailing on non-register operand limits the pass. The src1 operand can be inline literal and still convertible to sdwa form. rampitec: I mean bailing on non-register operand limits the pass. The src1 operand can be inline literal…
		yassinghAuthorUnsubmitted Done Reply Inline Actions Earlier, we did not proceed if the instruction was not shrinkable. Now we are also checking whether the reason for not shrinking is src1 being an immediate operand. Hence we are covering src1 being an inline literal (correct me if I'm wrongly assuming that inline literal means immediate operand). yassingh: Earlier, we did not proceed if the instruction was not shrinkable. Now we are also…
		arsenmUnsubmitted Not Done Reply Inline Actions Inline immediates are the ones that are free to encode in vsrc0, or in all vsrc operands for VOP3, as opposed to other literals that require materialization in a register. See TII::isInlineConstant arsenm: Inline immediates are the ones that are free to encode in vsrc0, or in all vsrc operands for…
		rampitecUnsubmitted Not Done Reply Inline Actions If an immediate is an inline literal, the instruction is still shrinkable. For example, if it is 0 or 1 it is shrinkable; if it is 100 it is not. rampitec: If an immediate is an inline literal, the instruction is still shrinkable. For example, if it is 0 or…
		cdevadasUnsubmitted Not Done Reply Inline Actions Need a comment here for allowing non-register vsrc1 cases. cdevadas: Need a comment here for allowing non-register vsrc1 cases.
if (!TII->canShrink(MISucc, *MRI))
return;
int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
// Make sure the carry in/out are subsequently unused.		// Make sure the carry in/out are subsequently unused.
MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);		MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
if (!CarryIn)		if (!CarryIn)
return;		return;
MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);		MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
if (!CarryOut)		if (!CarryOut)
return;		return;
if (!MRI->hasOneUse(CarryIn->getReg()) \|\| !MRI->use_empty(CarryOut->getReg()))		if (!MRI->hasOneUse(CarryIn->getReg()) \|\| !MRI->use_empty(CarryOut->getReg()))
return;		return;
// Make sure VCC or its subregs are dead before MI.		// Make sure VCC or its subregs are dead before MI.
MachineBasicBlock &MBB = *MI.getParent();		MachineBasicBlock &MBB = *MI.getParent();
auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);		auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
if (Liveness != MachineBasicBlock::LQR_Dead)		if (Liveness != MachineBasicBlock::LQR_Dead)
return;		return;
// Check if VCC is referenced in range of (MI,MISucc].		// Check if VCC is referenced in range of (MI,MISucc].
for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();		for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
I != E; ++I) {		I != E; ++I) {
if (I->modifiesRegister(AMDGPU::VCC, TRI))		if (I->modifiesRegister(AMDGPU::VCC, TRI))
return;		return;
}		}

// Make the two new e32 instruction variants.
// Replace MI with V_{SUB\|ADD}_I32_e32		// Replace MI with V_{SUB\|ADD}_I32_e32
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))		BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))		.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))		.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))		.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
.setMIFlags(MI.getFlags());		.setMIFlags(MI.getFlags());

MI.eraseFromParent();		MI.eraseFromParent();

// Replace MISucc with V_{SUBB\|ADDC}_U32_e32		// Since the carry output of MI is now VCC, update its use in MISucc.
		foadUnsubmitted Not Done Reply Inline Actions "output", "its" (no apostrophe), full stop at end of sentence. foad: "output", "its" (no apostrophe), full stop at end of sentence.
BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc))
.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst))
.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0))
.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1))
.setMIFlags(MISucc.getFlags());

MISucc.eraseFromParent();		MISucc.substituteRegister(CarryIn->getReg(), AMDGPU::VCC, 0, *TRI);
		foadUnsubmitted Not Done Reply Inline Actions Doesn't this need to be VCC_LO for wave32? Please add a test for that. You can use SIRegisterInfo::getVCC() to get the appropriate reg for the wave size. foad: Doesn't this need to be VCC_LO for wave32? Please add a test for that. You can use…
		yassinghAuthorUnsubmitted Done Reply Inline Actions Updated to TRI->getVCC(). However I am not able to add test for VCC_LO. Tried compiling for gfx1010, wavefrontsize=32 but SIInstrInfo::canShrink returns false for V_ADD_CO_U32_e64 hence the pass does not attempt converting it to sdwa form. Responsible condition in SIInstrInfo::canShrink() => if (!hasVALU32BitEncoding(MI.getOpcode())) return false; yassingh: Updated to TRI->getVCC(). However I am not able to add test for VCC_LO. Tried compiling for…
		foadUnsubmitted Not Done Reply Inline Actions Oh you're right, GFX10/11 V_ADD_CO_U32 does not have an e32 or sdwa form. Sorry. foad: Oh you're right, GFX10/11 V_ADD_CO_U32 does not have an e32 or sdwa form. Sorry.
}		}

bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,		bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
const GCNSubtarget &ST) const {		const GCNSubtarget &ST) const {
// Check if this is already an SDWA instruction		// Check if this is already an SDWA instruction
unsigned Opc = MI.getOpcode();		unsigned Opc = MI.getOpcode();
if (TII->isSDWA(Opc))		if (TII->isSDWA(Opc))
return true;		return true;

// Check if this instruction has opcode that supports SDWA		// Check if this instruction has opcode that supports SDWA
if (AMDGPU::getSDWAOp(Opc) == -1)		if (AMDGPU::getSDWAOp(Opc) == -1)
Opc = AMDGPU::getVOPe32(Opc);		Opc = AMDGPU::getVOPe32(Opc);

if (AMDGPU::getSDWAOp(Opc) == -1)		if (AMDGPU::getSDWAOp(Opc) == -1)
return false;		return false;

if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))		if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
return false;		return false;

if (TII->isVOPC(Opc)) {		if (TII->isVOPC(Opc)) {
if (!ST.hasSDWASdst()) {		if (!ST.hasSDWASdst()) {
const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);		const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
if (SDst && (SDst->getReg() != AMDGPU::VCC &&		if (SDst && (SDst->getReg() != AMDGPU::VCC &&
		arsenmUnsubmitted Not Done Reply Inline Actions This is increasing the instruction size (and most likely the code size). This only makes sense to do if we know the fold into the operand can happen. This should perform those legality checks and make the full transform arsenm: This is increasing the instruction size (and most likely the code size). This only makes sense…
		yassinghAuthorUnsubmitted Done Reply Inline Actions Yes. There are 2 possible scenarios, if only the src1 operand is immediate then the 'mov' instruction will be folded. If both operands are immediates then 'MOV' will be part of generated assembly. Function 1 and 2 in the both test-files depict these scenarios. I can add the legality check for this fold happening but then some of the instructions will be missed? yassingh: Yes. There are 2 possible scenarios, if only the src1 operand is immediate then the 'mov'…
SDst->getReg() != AMDGPU::VCC_LO))		SDst->getReg() != AMDGPU::VCC_LO))
return false;		return false;
}		}

if (!ST.hasSDWAOutModsVOPC() &&		if (!ST.hasSDWAOutModsVOPC() &&
(TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) \|\|		(TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) \|\|
TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))		TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
return false;		return false;
▲ Show 20 Lines • Show All 305 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/sdwa-ops.mir

# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-peephole-sdwa -o - %s \| FileCheck -check-prefix=GFX9 %s		# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-peephole-sdwa -o - %s \| FileCheck -check-prefix=GFX9 %s
# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass=si-peephole-sdwa -o - %s \| FileCheck -check-prefix=GFX9 %s		# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass=si-peephole-sdwa -o - %s \| FileCheck -check-prefix=GFX9 %s

# test for 3 consecutive _sdwa's		# test for 3 consecutive _sdwa's
# GFX9-LABEL: name: test1_add_co_sdwa		# GFX9-LABEL: name: test1_add_co_sdwa
# GFX9: = nsw V_ADD_CO_U32_sdwa		# GFX9: = nsw V_ADD_CO_U32_sdwa
# GFX9-NEXT: = nuw V_ADDC_U32_e32		# GFX9-NEXT: = nuw V_ADDC_U32_e64
# GFX9: V_ADD_CO_U32_sdwa		# GFX9: V_ADD_CO_U32_sdwa
# GFX9-NEXT: V_ADDC_U32_e32		# GFX9-NEXT: V_ADDC_U32_e64
# GFX9: V_ADD_CO_U32_sdwa		# GFX9: V_ADD_CO_U32_sdwa
# GFX9-NEXT: V_ADDC_U32_e32		# GFX9-NEXT: V_ADDC_U32_e64
---		---
name: test1_add_co_sdwa		name: test1_add_co_sdwa
tracksRegLiveness: true		tracksRegLiveness: true
registers:		registers:
- { id: 0, class: vgpr_32, preferred-register: '' }		- { id: 0, class: vgpr_32, preferred-register: '' }
liveins:		liveins:
- { reg: '$vgpr0', virtual-reg: '%0' }		- { reg: '$vgpr0', virtual-reg: '%0' }
- { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }		- { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
Show All 23 Lines	bb.0:
%172:vreg_64 = REG_SEQUENCE %173, %subreg.sub0, %174, %subreg.sub1		%172:vreg_64 = REG_SEQUENCE %173, %subreg.sub0, %174, %subreg.sub1
GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %172, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))		GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %172, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))

...		...

# test for VCC interference on sdwa, should generate 1 xform only		# test for VCC interference on sdwa, should generate 1 xform only
# GFX9-LABEL: name: test2_add_co_sdwa		# GFX9-LABEL: name: test2_add_co_sdwa
# GFX9: V_ADD_CO_U32_sdwa		# GFX9: V_ADD_CO_U32_sdwa
# GFX9: V_ADDC_U32_e32		# GFX9: V_ADDC_U32_e64
# GFX9-NOT: V_ADD_CO_U32_sdwa		# GFX9-NOT: V_ADD_CO_U32_sdwa
# GFX9-NOT: V_ADDC_U32_e32		# GFX9-NOT: V_ADDC_U32_e32
---		---
name: test2_add_co_sdwa		name: test2_add_co_sdwa
tracksRegLiveness: true		tracksRegLiveness: true
registers:		registers:
- { id: 0, class: vgpr_32, preferred-register: '' }		- { id: 0, class: vgpr_32, preferred-register: '' }
liveins:		liveins:
▲ Show 20 Lines • Show All 86 Lines • ▼ Show 20 Lines	bb.0:
GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))		GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))


...		...

# test for simple example, should generate sdwa		# test for simple example, should generate sdwa
# GFX9-LABEL: name: test5_add_co_sdwa		# GFX9-LABEL: name: test5_add_co_sdwa
# GFX9: V_ADD_CO_U32_sdwa		# GFX9: V_ADD_CO_U32_sdwa
# GFX9: V_ADDC_U32_e32		# GFX9: V_ADDC_U32_e64
---		---
name: test5_add_co_sdwa		name: test5_add_co_sdwa
tracksRegLiveness: true		tracksRegLiveness: true
registers:		registers:
- { id: 0, class: vgpr_32, preferred-register: '' }		- { id: 0, class: vgpr_32, preferred-register: '' }
liveins:		liveins:
- { reg: '$vgpr0', virtual-reg: '%0' }		- { reg: '$vgpr0', virtual-reg: '%0' }
- { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }		- { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
▲ Show 20 Lines • Show All 220 Lines • ▼ Show 20 Lines	bb.0:
%30:vreg_64 = COPY $sgpr0_sgpr1		%30:vreg_64 = COPY $sgpr0_sgpr1
%23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec		%23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
%63:vgpr_32, %65:sreg_64_xexec = V_ADD_CO_U32_e64 %30.sub0, %23, 0, implicit $exec		%63:vgpr_32, %65:sreg_64_xexec = V_ADD_CO_U32_e64 %30.sub0, %23, 0, implicit $exec
$vcc = COPY %30		$vcc = COPY %30
%31:vreg_64 = COPY killed $vcc		%31:vreg_64 = COPY killed $vcc
%64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, 0, implicit $exec		%64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, 0, implicit $exec
%62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1		%62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
GLOBAL_STORE_DWORDX2_SADDR %31.sub0, %62, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))		GLOBAL_STORE_DWORDX2_SADDR %31.sub0, %62, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))

llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GFX9 %s			; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GFX9 %s
	define amdgpu_kernel void @sdwa_test() local_unnamed_addr #0 {			define amdgpu_kernel void @sdwa_test() local_unnamed_addr #0 {
	; GFX9-LABEL: sdwa_test:			; GFX9-LABEL: sdwa_test:
	; GFX9: ; %bb.0: ; %bb			; GFX9: ; %bb.0: ; %bb
	; GFX9-NEXT: v_add_u32_e32 v1, 10, v0			; GFX9-NEXT: v_add_u32_e32 v1, 10, v0
	; GFX9-NEXT: v_add_u32_e32 v0, 20, v0			; GFX9-NEXT: v_add_u32_e32 v0, 20, v0
	; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0			; GFX9-NEXT: v_add_co_u32_sdwa v0, vcc, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
	; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0
	; GFX9-NEXT: v_addc_co_u32_e64 v1, s[0:1], 0, 0, vcc			; GFX9-NEXT: v_addc_co_u32_e64 v1, s[0:1], 0, 0, vcc
	; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off			; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
				foadUnsubmitted Not Done Reply Inline Actions This is silly. You have added a mov instruction to shrink the addc instruction for no reason, because it is not actually converted to sdwa form. foad: This is silly. You have added a mov instruction to shrink the addc instruction for no reason…
				yassinghAuthorUnsubmitted Done Reply Inline Actions Yes you are right but the pass right now only attempts to convert to sdwa when both instructions (V_ADD_CO and V_ADDC_CO) are shrinkable to their sdwa formed so I went this way. I can explore only shrinking the v_add_co instruction and avoid inserting that mov instruction? yassingh: Yes you are right but the pass right now only attempts to convert to sdwa when both…
	; GFX9-NEXT: s_endpgm			; GFX9-NEXT: s_endpgm
	bb:			bb:
	%tid = tail call i32 @llvm.amdgcn.workitem.id.x()			%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
	%v0 = add i32 %tid, 10			%v0 = add i32 %tid, 10
	%v1 = add i32 %tid, 20			%v1 = add i32 %tid, 20
	%v2 = zext i32 %v0 to i64			%v2 = zext i32 %v0 to i64
	%v3 = zext i32 %v1 to i64			%v3 = zext i32 %v1 to i64
	%v.t = and i64 %v3, 255			%v.t = and i64 %v3, 255
	%v4 = add i64 %v2, %v.t			%v4 = add i64 %v2, %v.t
	store i64 %v4, i64 addrspace(1) * undef			store i64 %v4, i64 addrspace(1) * undef
	ret void			ret void
	}			}


	define amdgpu_kernel void @test_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 {			define amdgpu_kernel void @test_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 {
	; GFX9-LABEL: test_add_co_sdwa:			; GFX9-LABEL: test_add_co_sdwa:
	; GFX9: ; %bb.0: ; %bb			; GFX9: ; %bb.0: ; %bb
	; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24			; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
	; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0			; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
	; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0			; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
	; GFX9-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-NEXT: s_waitcnt lgkmcnt(0)
	; GFX9-NEXT: global_load_dword v2, v1, s[2:3]			; GFX9-NEXT: global_load_dword v4, v2, s[2:3]
	; GFX9-NEXT: s_nop 0
	; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]			; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
	; GFX9-NEXT: s_waitcnt vmcnt(1)
	; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2
	; GFX9-NEXT: s_waitcnt vmcnt(0)			; GFX9-NEXT: s_waitcnt vmcnt(0)
	; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2			; GFX9-NEXT: v_add_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
	; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc			; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
	; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]			; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
	; GFX9-NEXT: s_endpgm			; GFX9-NEXT: s_endpgm
	bb:			bb:
	%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()			%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
	%tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp			%tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp
	%tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4			%tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
	%tmp5 = and i32 %tmp4, 255			%tmp5 = and i32 %tmp4, 255
	Show All 10 Lines

llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GFX9 %s			; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GFX9 %s
	define amdgpu_kernel void @sdwa_test_sub() local_unnamed_addr #0 {			define amdgpu_kernel void @sdwa_test_sub() local_unnamed_addr #0 {
	; GFX9-LABEL: sdwa_test_sub:			; GFX9-LABEL: sdwa_test_sub:
	; GFX9: ; %bb.0: ; %bb			; GFX9: ; %bb.0: ; %bb
	; GFX9-NEXT: v_add_u32_e32 v1, 10, v0			; GFX9-NEXT: v_add_u32_e32 v1, 10, v0
	; GFX9-NEXT: v_add_u32_e32 v0, 20, v0			; GFX9-NEXT: v_add_u32_e32 v0, 20, v0
	; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0			; GFX9-NEXT: v_sub_co_u32_sdwa v0, vcc, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
	; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v1, v0
	; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], 0, 0, vcc			; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], 0, 0, vcc
	; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off			; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
	; GFX9-NEXT: s_endpgm			; GFX9-NEXT: s_endpgm
	bb:			bb:
	%tid = tail call i32 @llvm.amdgcn.workitem.id.x()			%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
	%v0 = add i32 %tid, 10			%v0 = add i32 %tid, 10
	%v1 = add i32 %tid, 20			%v1 = add i32 %tid, 20
	%v2 = zext i32 %v0 to i64			%v2 = zext i32 %v0 to i64
	%v3 = zext i32 %v1 to i64			%v3 = zext i32 %v1 to i64
	%v.t = and i64 %v3, 255			%v.t = and i64 %v3, 255
	%v4 = sub i64 %v2, %v.t			%v4 = sub i64 %v2, %v.t
	store i64 %v4, i64 addrspace(1) * undef			store i64 %v4, i64 addrspace(1) * undef
	ret void			ret void
	}			}


	define amdgpu_kernel void @test_sub_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 {			define amdgpu_kernel void @test_sub_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 {
	; GFX9-LABEL: test_sub_co_sdwa:			; GFX9-LABEL: test_sub_co_sdwa:
	; GFX9: ; %bb.0: ; %bb			; GFX9: ; %bb.0: ; %bb
	; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24			; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
	; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0			; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
	; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0			; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
	; GFX9-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-NEXT: s_waitcnt lgkmcnt(0)
	; GFX9-NEXT: global_load_dword v2, v1, s[2:3]			; GFX9-NEXT: global_load_dword v4, v2, s[2:3]
	; GFX9-NEXT: s_nop 0
	; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]			; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
	; GFX9-NEXT: s_waitcnt vmcnt(1)
	; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2
	; GFX9-NEXT: s_waitcnt vmcnt(0)			; GFX9-NEXT: s_waitcnt vmcnt(0)
	; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2			; GFX9-NEXT: v_sub_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
	; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc			; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
	; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]			; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
	; GFX9-NEXT: s_endpgm			; GFX9-NEXT: s_endpgm
	bb:			bb:
	%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()			%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
	%tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp			%tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp
	%tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4			%tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
	%tmp5 = and i32 %tmp4, 255			%tmp5 = and i32 %tmp4, 255
	Show All 10 Lines

llvm/tmp.mir

This file was added.

				--- \|
				; ModuleID = './test/CodeGen/AMDGPU/sdwa-ops.mir'
				source_filename = "./test/CodeGen/AMDGPU/sdwa-ops.mir"
				target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
				target triple = "amdgcn-unknown-unknown"

				define void @test1_add_co_sdwa() #0 {
				entry:
				unreachable
				}

				define void @test2_add_co_sdwa() #0 {
				entry:
				unreachable
				}

				define void @test3_add_co_sdwa() #0 {
				entry:
				unreachable
				}

				define void @test4_add_co_sdwa() #0 {
				entry:
				unreachable
				}

				define void @test5_add_co_sdwa() #0 {
				entry:
				unreachable
				}

				define void @test6_add_co_sdwa() #0 {
				entry:
				unreachable
				}

				define void @test7_add_co_sdwa() #0 {
				entry:
				unreachable
				}

				define void @test8_add_co_sdwa() #0 {
				entry:
				unreachable
				}

				define void @test9_add_co_sdwa() #0 {
				entry:
				unreachable
				}

				define void @test10_add_co_sdwa() #0 {
				entry:
				unreachable
				}

				define void @test11_add_co_sdwa() #0 {
				entry:
				unreachable
				}

				define void @test12_add_co_sdwa() #0 {
				entry:
				unreachable
				}

				attributes #0 = { "target-cpu"="gfx900" }

				...
				---
				name: test1_add_co_sdwa
				foadUnsubmitted Not Done Reply Inline Actions This can be much simpler. Typically you only need `name:` and `tracksRegLiveness:` and `body:`. If you used `llc` to generate this then you should try adding `-simplify-mir`. foad: This can be much simpler. Typically you only need `name:` and `tracksRegLiveness:` and `body:`.
				yassinghAuthorUnsubmitted Done Reply Inline Actions Thanks for pointing that out, accidentally added this file to review. Removing now yassingh: Thanks for pointing that out, accidentally added this file to review. Removing now
				alignment: 1
				exposesReturnsTwice: false
				legalized: false
				regBankSelected: false
				selected: false
				failedISel: false
				tracksRegLiveness: true
				hasWinCFI: false
				callsEHReturn: false
				callsUnwindInit: false
				hasEHCatchret: false
				hasEHScopes: false
				hasEHFunclets: false
				failsVerification: false
				tracksDebugUserValues: false
				registers:
				- { id: 0, class: vgpr_32, preferred-register: '' }
				- { id: 1, class: sgpr_64, preferred-register: '' }
				- { id: 2, class: sreg_32_xm0, preferred-register: '' }
				- { id: 3, class: vgpr_32, preferred-register: '' }
				- { id: 4, class: vreg_64, preferred-register: '' }
				- { id: 5, class: vgpr_32, preferred-register: '' }
				- { id: 6, class: sreg_64_xexec, preferred-register: '' }
				- { id: 7, class: vgpr_32, preferred-register: '' }
				- { id: 8, class: sreg_64_xexec, preferred-register: '' }
				- { id: 9, class: vreg_64, preferred-register: '' }
				- { id: 10, class: vgpr_32, preferred-register: '' }
				- { id: 11, class: vgpr_32, preferred-register: '' }
				- { id: 12, class: sreg_64_xexec, preferred-register: '' }
				- { id: 13, class: vgpr_32, preferred-register: '' }
				- { id: 14, class: sreg_64_xexec, preferred-register: '' }
				- { id: 15, class: vreg_64, preferred-register: '' }
				- { id: 16, class: vgpr_32, preferred-register: '' }
				- { id: 17, class: vgpr_32, preferred-register: '' }
				- { id: 18, class: sreg_64_xexec, preferred-register: '' }
				- { id: 19, class: vgpr_32, preferred-register: '' }
				- { id: 20, class: sreg_64_xexec, preferred-register: '' }
				- { id: 21, class: vreg_64, preferred-register: '' }
				liveins:
				- { reg: '$vgpr0', virtual-reg: '%0' }
				- { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
				frameInfo:
				isFrameAddressTaken: false
				isReturnAddressTaken: false
				hasStackMap: false
				hasPatchPoint: false
				stackSize: 0
				offsetAdjustment: 0
				maxAlignment: 1
				adjustsStack: false
				hasCalls: false
				stackProtector: ''
				functionContext: ''
				maxCallFrameSize: 4294967295
				cvBytesOfCalleeSavedRegisters: 0
				hasOpaqueSPAdjustment: false
				hasVAStart: false
				hasMustTailInVarArgFunc: false
				hasTailCall: false
				localFrameSize: 0
				savePoint: ''
				restorePoint: ''
				fixedStack: []
				stack: []
				callSites: []
				debugValueSubstitutions: []
				constants: []
				machineFunctionInfo:
				explicitKernArgSize: 0
				maxKernArgAlign: 1
				ldsSize: 0
				gdsSize: 0
				dynLDSAlign: 1
				isEntryFunction: false
				noSignedZerosFPMath: false
				memoryBound: false
				waveLimiter: false
				hasSpilledSGPRs: false
				hasSpilledVGPRs: false
				scratchRSrcReg: '$private_rsrc_reg'
				frameOffsetReg: '$fp_reg'
				stackPtrOffsetReg: '$sp_reg'
				bytesInStackArgArea: 0
				returnsVoid: true
				argumentInfo:
				privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
				dispatchPtr: { reg: '$sgpr4_sgpr5' }
				queuePtr: { reg: '$sgpr6_sgpr7' }
				dispatchID: { reg: '$sgpr10_sgpr11' }
				workGroupIDX: { reg: '$sgpr12' }
				workGroupIDY: { reg: '$sgpr13' }
				workGroupIDZ: { reg: '$sgpr14' }
				LDSKernelId: { reg: '$sgpr15' }
				implicitArgPtr: { reg: '$sgpr8_sgpr9' }
				workItemIDX: { reg: '$vgpr31', mask: 1023 }
				workItemIDY: { reg: '$vgpr31', mask: 1047552 }
				workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
				mode:
				ieee: true
				dx10-clamp: true
				fp32-input-denormals: true
				fp32-output-denormals: true
				fp64-fp16-input-denormals: true
				fp64-fp16-output-denormals: true
				highBitsOf32BitAddress: 0
				occupancy: 10
				vgprForAGPRCopy: ''
				body: \|
				bb.0:
				liveins: $vgpr0, $sgpr0_sgpr1

				%1:sgpr_64 = COPY $sgpr0_sgpr1
				%0:vgpr_32 = COPY $vgpr0
				%2:sreg_32_xm0 = S_MOV_B32 255
				%3:vgpr_32 = V_AND_B32_e32 %2, %0, implicit $exec
				%4:vreg_64 = COPY $sgpr0_sgpr1
				%5:vgpr_32 = nsw V_ADD_CO_U32_sdwa 0, %4.sub0, 0, %0, 0, 6, 0, 6, 0, implicit-def $vcc, implicit $exec
				%7:vgpr_32, dead %8:sreg_64_xexec = nuw V_ADDC_U32_e64 %4.sub1, %0, killed $vcc, 0, implicit $exec
				%9:vreg_64 = REG_SEQUENCE %5, %subreg.sub0, %7, %subreg.sub1
				GLOBAL_STORE_DWORDX2_SADDR %4.sub0, %9, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))
				%10:vgpr_32 = V_AND_B32_e32 %2, %0, implicit $exec
				%11:vgpr_32 = V_ADD_CO_U32_sdwa 0, %4.sub0, 0, %0, 0, 6, 0, 6, 0, implicit-def $vcc, implicit $exec
				%13:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 %4.sub1, %0, killed $vcc, 0, implicit $exec
				%15:vreg_64 = REG_SEQUENCE %11, %subreg.sub0, %13, %subreg.sub1
				GLOBAL_STORE_DWORDX2_SADDR %4.sub0, %15, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))
				%16:vgpr_32 = V_AND_B32_e32 %2, %0, implicit $exec
				%17:vgpr_32 = V_ADD_CO_U32_sdwa 0, %4.sub0, 0, %0, 0, 6, 0, 6, 0, implicit-def $vcc, implicit $exec
				%19:vgpr_32, dead %20:sreg_64_xexec = V_ADDC_U32_e64 %4.sub1, %0, killed $vcc, 0, implicit $exec
				%21:vreg_64 = REG_SEQUENCE %17, %subreg.sub0, %19, %subreg.sub1
				GLOBAL_STORE_DWORDX2_SADDR %4.sub0, %21, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))

				...
				---
				name: test2_add_co_sdwa
				alignment: 1
				exposesReturnsTwice: false
				legalized: false
				regBankSelected: false
				selected: false
				failedISel: false
				tracksRegLiveness: true
				hasWinCFI: false
				callsEHReturn: false
				callsUnwindInit: false
				hasEHCatchret: false
				hasEHScopes: false
				hasEHFunclets: false
				failsVerification: false
				tracksDebugUserValues: false
				registers:
				- { id: 0, class: vgpr_32, preferred-register: '' }
				- { id: 1, class: sgpr_64, preferred-register: '' }
				- { id: 2, class: sreg_32_xm0, preferred-register: '' }
				- { id: 3, class: vgpr_32, preferred-register: '' }
				- { id: 4, class: vreg_64, preferred-register: '' }
				- { id: 5, class: vgpr_32, preferred-register: '' }
				- { id: 6, class: sreg_64_xexec, preferred-register: '' }
				- { id: 7, class: vgpr_32, preferred-register: '' }
				- { id: 8, class: vgpr_32, preferred-register: '' }
				- { id: 9, class: sreg_64_xexec, preferred-register: '' }
				- { id: 10, class: vgpr_32, preferred-register: '' }
				- { id: 11, class: sreg_64_xexec, preferred-register: '' }
				- { id: 12, class: vreg_64, preferred-register: '' }
				- { id: 13, class: vgpr_32, preferred-register: '' }
				- { id: 14, class: sreg_64_xexec, preferred-register: '' }
				- { id: 15, class: vreg_64, preferred-register: '' }
				liveins:
				- { reg: '$vgpr0', virtual-reg: '%0' }
				- { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
				frameInfo:
				isFrameAddressTaken: false
				isReturnAddressTaken: false
				hasStackMap: false
				hasPatchPoint: false
				stackSize: 0
				offsetAdjustment: 0
				maxAlignment: 1
				adjustsStack: false
				hasCalls: false
				stackProtector: ''
				functionContext: ''
				maxCallFrameSize: 4294967295
				cvBytesOfCalleeSavedRegisters: 0
				hasOpaqueSPAdjustment: false
				hasVAStart: false
				hasMustTailInVarArgFunc: false
				hasTailCall: false
				localFrameSize: 0
				savePoint: ''
				restorePoint: ''
				fixedStack: []
				stack: []
				callSites: []
				debugValueSubstitutions: []
				constants: []
				machineFunctionInfo:
				explicitKernArgSize: 0
				maxKernArgAlign: 1
				ldsSize: 0
				gdsSize: 0
				dynLDSAlign: 1
				isEntryFunction: false
				noSignedZerosFPMath: false
				memoryBound: false
				waveLimiter: false
				hasSpilledSGPRs: false
				hasSpilledVGPRs: false
				scratchRSrcReg: '$private_rsrc_reg'
				frameOffsetReg: '$fp_reg'
				stackPtrOffsetReg: '$sp_reg'
				bytesInStackArgArea: 0
				returnsVoid: true
				argumentInfo:
				privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
				dispatchPtr: { reg: '$sgpr4_sgpr5' }
				queuePtr: { reg: '$sgpr6_sgpr7' }
				dispatchID: { reg: '$sgpr10_sgpr11' }
				workGroupIDX: { reg: '$sgpr12' }
				workGroupIDY: { reg: '$sgpr13' }
				workGroupIDZ: { reg: '$sgpr14' }
				LDSKernelId: { reg: '$sgpr15' }
				implicitArgPtr: { reg: '$sgpr8_sgpr9' }
				workItemIDX: { reg: '$vgpr31', mask: 1023 }
				workItemIDY: { reg: '$vgpr31', mask: 1047552 }
				workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
				mode:
				ieee: true
				dx10-clamp: true
				fp32-input-denormals: true
				fp32-output-denormals: true
				fp64-fp16-input-denormals: true
				fp64-fp16-output-denormals: true
				highBitsOf32BitAddress: 0
				occupancy: 10
				vgprForAGPRCopy: ''
				body: \|
				bb.0:
				liveins: $vgpr0, $sgpr0_sgpr1

				%1:sgpr_64 = COPY $sgpr0_sgpr1
				%0:vgpr_32 = COPY $vgpr0
				%2:sreg_32_xm0 = S_MOV_B32 255
				%3:vgpr_32 = V_AND_B32_e32 %2, %0, implicit $exec
				%4:vreg_64 = COPY $sgpr0_sgpr1
				%5:vgpr_32 = V_ADD_CO_U32_sdwa 0, %4.sub0, 0, %0, 0, 6, 0, 6, 0, implicit-def $vcc, implicit $exec
				%7:vgpr_32 = V_AND_B32_e32 %2, %0, implicit $exec
				%8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %7, 0, implicit $exec
				%10:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 %4.sub1, %0, killed %9, 0, implicit $exec
				%12:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %10, %subreg.sub1
				%13:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 %4.sub1, %0, killed $vcc, 0, implicit $exec
				%15:vreg_64 = REG_SEQUENCE %5, %subreg.sub0, %13, %subreg.sub1
				GLOBAL_STORE_DWORDX2_SADDR %4.sub0, %15, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))
				%7:vgpr_32 = V_AND_B32_e32 %2, %0, implicit $exec
				%8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %7, 0, implicit $exec
				%10:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 %4.sub1, %0, killed %9, 0, implicit $exec
				%12:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %10, %subreg.sub1
				GLOBAL_STORE_DWORDX2_SADDR %4.sub0, %12, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))

				...
				---
				name: test3_add_co_sdwa
				alignment: 1
				exposesReturnsTwice: false
				legalized: false
				regBankSelected: false
				selected: false
				failedISel: false
				tracksRegLiveness: true
				hasWinCFI: false
				callsEHReturn: false
				callsUnwindInit: false
				hasEHCatchret: false
				hasEHScopes: false
				hasEHFunclets: false
				failsVerification: false
				tracksDebugUserValues: false
				registers:
				- { id: 0, class: vgpr_32, preferred-register: '' }
				- { id: 1, class: sgpr_64, preferred-register: '' }
				- { id: 2, class: sreg_32_xm0, preferred-register: '' }
				- { id: 3, class: vgpr_32, preferred-register: '' }
				- { id: 4, class: vreg_64, preferred-register: '' }
				- { id: 5, class: vgpr_32, preferred-register: '' }
				- { id: 6, class: sreg_64_xexec, preferred-register: '' }
				- { id: 7, class: vgpr_32, preferred-register: '' }
				- { id: 8, class: sreg_64_xexec, preferred-register: '' }
				- { id: 9, class: vreg_64, preferred-register: '' }
				liveins:
				- { reg: '$vgpr0', virtual-reg: '%0' }
				- { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
				frameInfo:
				isFrameAddressTaken: false
				isReturnAddressTaken: false
				hasStackMap: false
				hasPatchPoint: false
				stackSize: 0
				offsetAdjustment: 0
				maxAlignment: 1
				adjustsStack: false
				hasCalls: false
				stackProtector: ''
				functionContext: ''
				maxCallFrameSize: 4294967295
				cvBytesOfCalleeSavedRegisters: 0
				hasOpaqueSPAdjustment: false
				hasVAStart: false
				hasMustTailInVarArgFunc: false
				hasTailCall: false
				localFrameSize: 0
				savePoint: ''
				restorePoint: ''
				fixedStack: []
				stack: []
				callSites: []
				debugValueSubstitutions: []
				constants: []
				machineFunctionInfo:
				explicitKernArgSize: 0
				maxKernArgAlign: 1
				ldsSize: 0
				gdsSize: 0
				dynLDSAlign: 1
				isEntryFunction: false
				noSignedZerosFPMath: false
				memoryBound: false
				waveLimiter: false
				hasSpilledSGPRs: false
				hasSpilledVGPRs: false
				scratchRSrcReg: '$private_rsrc_reg'
				frameOffsetReg: '$fp_reg'
				stackPtrOffsetReg: '$sp_reg'
				bytesInStackArgArea: 0
				returnsVoid: true
				argumentInfo:
				privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
				dispatchPtr: { reg: '$sgpr4_sgpr5' }
				queuePtr: { reg: '$sgpr6_sgpr7' }
				dispatchID: { reg: '$sgpr10_sgpr11' }
				workGroupIDX: { reg: '$sgpr12' }
				workGroupIDY: { reg: '$sgpr13' }
				workGroupIDZ: { reg: '$sgpr14' }
				LDSKernelId: { reg: '$sgpr15' }
				implicitArgPtr: { reg: '$sgpr8_sgpr9' }
				workItemIDX: { reg: '$vgpr31', mask: 1023 }
				workItemIDY: { reg: '$vgpr31', mask: 1047552 }
				workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
				mode:
				ieee: true
				dx10-clamp: true
				fp32-input-denormals: true
				fp32-output-denormals: true
				fp64-fp16-input-denormals: true
				fp64-fp16-output-denormals: true
				highBitsOf32BitAddress: 0
				occupancy: 10
				vgprForAGPRCopy: ''
				body: \|
				bb.0:
				liveins: $vgpr0, $sgpr0_sgpr1

				%1:sgpr_64 = COPY $sgpr0_sgpr1
				%0:vgpr_32 = COPY $vgpr0
				%2:sreg_32_xm0 = S_MOV_B32 255
				%3:vgpr_32 = V_AND_B32_e32 %2, %0, implicit $exec
				%4:vreg_64 = COPY $sgpr0_sgpr1
				%5:vgpr_32, %6:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %3, 0, implicit $exec
				%7:vgpr_32, %8:sreg_64_xexec = V_ADDC_U32_e64 %4.sub1, %0, killed %6, 0, implicit $exec
				%9:vreg_64 = REG_SEQUENCE %5, %subreg.sub0, %8, %subreg.sub1
				GLOBAL_STORE_DWORDX2_SADDR %4.sub0, %9, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))

				...
				---
				name: test4_add_co_sdwa
				alignment: 1
				exposesReturnsTwice: false
				legalized: false
				regBankSelected: false
				selected: false
				failedISel: false
				tracksRegLiveness: true
				hasWinCFI: false
				callsEHReturn: false
				callsUnwindInit: false
				hasEHCatchret: false
				hasEHScopes: false
				hasEHFunclets: false
				failsVerification: false
				tracksDebugUserValues: false
				registers:
				- { id: 0, class: vgpr_32, preferred-register: '' }
				- { id: 1, class: sgpr_64, preferred-register: '' }
				- { id: 2, class: sreg_32_xm0, preferred-register: '' }
				- { id: 3, class: vgpr_32, preferred-register: '' }
				- { id: 4, class: vreg_64, preferred-register: '' }
				- { id: 5, class: vgpr_32, preferred-register: '' }
				- { id: 6, class: sreg_64_xexec, preferred-register: '' }
				- { id: 7, class: vgpr_32, preferred-register: '' }
				- { id: 8, class: sreg_64_xexec, preferred-register: '' }
				- { id: 9, class: vreg_64, preferred-register: '' }
				liveins:
				- { reg: '$vgpr0', virtual-reg: '%0' }
				- { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
				frameInfo:
				isFrameAddressTaken: false
				isReturnAddressTaken: false
				hasStackMap: false
				hasPatchPoint: false
				stackSize: 0
				offsetAdjustment: 0
				maxAlignment: 1
				adjustsStack: false
				hasCalls: false
				stackProtector: ''
				functionContext: ''
				maxCallFrameSize: 4294967295
				cvBytesOfCalleeSavedRegisters: 0
				hasOpaqueSPAdjustment: false
				hasVAStart: false
				hasMustTailInVarArgFunc: false
				hasTailCall: false
				localFrameSize: 0
				savePoint: ''
				restorePoint: ''
				fixedStack: []
				stack: []
				callSites: []
				debugValueSubstitutions: []
				constants: []
				machineFunctionInfo:
				explicitKernArgSize: 0
				maxKernArgAlign: 1
				ldsSize: 0
				gdsSize: 0
				dynLDSAlign: 1
				isEntryFunction: false
				noSignedZerosFPMath: false
				memoryBound: false
				waveLimiter: false
				hasSpilledSGPRs: false
				hasSpilledVGPRs: false
				scratchRSrcReg: '$private_rsrc_reg'
				frameOffsetReg: '$fp_reg'
				stackPtrOffsetReg: '$sp_reg'
				bytesInStackArgArea: 0
				returnsVoid: true
				argumentInfo:
				privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
				dispatchPtr: { reg: '$sgpr4_sgpr5' }
				queuePtr: { reg: '$sgpr6_sgpr7' }
				dispatchID: { reg: '$sgpr10_sgpr11' }
				workGroupIDX: { reg: '$sgpr12' }
				workGroupIDY: { reg: '$sgpr13' }
				workGroupIDZ: { reg: '$sgpr14' }
				LDSKernelId: { reg: '$sgpr15' }
				implicitArgPtr: { reg: '$sgpr8_sgpr9' }
				workItemIDX: { reg: '$vgpr31', mask: 1023 }
				workItemIDY: { reg: '$vgpr31', mask: 1047552 }
				workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
				mode:
				ieee: true
				dx10-clamp: true
				fp32-input-denormals: true
				fp32-output-denormals: true
				fp64-fp16-input-denormals: true
				fp64-fp16-output-denormals: true
				highBitsOf32BitAddress: 0
				occupancy: 10
				vgprForAGPRCopy: ''
				body: \|
				bb.0:
				liveins: $vgpr0, $sgpr0_sgpr1

				%1:sgpr_64 = COPY $sgpr0_sgpr1
				%0:vgpr_32 = COPY $vgpr0
				%2:sreg_32_xm0 = S_MOV_B32 255
				%3:vgpr_32 = V_AND_B32_e32 %2, %0, implicit $exec
				%4:vreg_64 = COPY $sgpr0_sgpr1
				%5:vgpr_32, %6:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %3, 0, implicit $exec
				%7:vgpr_32, %8:sreg_64_xexec = V_ADDC_U32_e64 %4.sub1, %0, %6, 0, implicit $exec
				%9:vreg_64 = REG_SEQUENCE %5, %subreg.sub0, %6, %subreg.sub1
				GLOBAL_STORE_DWORDX2_SADDR %4.sub0, %9, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))

				...
				---
				name: test5_add_co_sdwa
				alignment: 1
				exposesReturnsTwice: false
				legalized: false
				regBankSelected: false
				selected: false
				failedISel: false
				tracksRegLiveness: true
				hasWinCFI: false
				callsEHReturn: false
				callsUnwindInit: false
				hasEHCatchret: false
				hasEHScopes: false
				hasEHFunclets: false
				failsVerification: false
				tracksDebugUserValues: false
				registers:
				- { id: 0, class: vgpr_32, preferred-register: '' }
				- { id: 1, class: sgpr_64, preferred-register: '' }
				- { id: 2, class: sreg_32_xm0, preferred-register: '' }
				- { id: 3, class: vgpr_32, preferred-register: '' }
				- { id: 4, class: vreg_64, preferred-register: '' }
				- { id: 5, class: vgpr_32, preferred-register: '' }
				- { id: 6, class: sreg_64_xexec, preferred-register: '' }
				- { id: 7, class: vgpr_32, preferred-register: '' }
				- { id: 8, class: sreg_64_xexec, preferred-register: '' }
				- { id: 9, class: vreg_64, preferred-register: '' }
				liveins:
				- { reg: '$vgpr0', virtual-reg: '%0' }
				- { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
				frameInfo:
				isFrameAddressTaken: false
				isReturnAddressTaken: false
				hasStackMap: false
				hasPatchPoint: false
				stackSize: 0
				offsetAdjustment: 0
				maxAlignment: 1
				adjustsStack: false
				hasCalls: false
				stackProtector: ''
				functionContext: ''
				maxCallFrameSize: 4294967295
				cvBytesOfCalleeSavedRegisters: 0
				hasOpaqueSPAdjustment: false
				hasVAStart: false
				hasMustTailInVarArgFunc: false
				hasTailCall: false
				localFrameSize: 0
				savePoint: ''
				restorePoint: ''
				fixedStack: []
				stack: []
				callSites: []
				debugValueSubstitutions: []
				constants: []
				machineFunctionInfo:
				explicitKernArgSize: 0
				maxKernArgAlign: 1
				ldsSize: 0
				gdsSize: 0
				dynLDSAlign: 1
				isEntryFunction: false
				noSignedZerosFPMath: false
				memoryBound: false
				waveLimiter: false
				hasSpilledSGPRs: false
				hasSpilledVGPRs: false
				scratchRSrcReg: '$private_rsrc_reg'
				frameOffsetReg: '$fp_reg'
				stackPtrOffsetReg: '$sp_reg'
				bytesInStackArgArea: 0
				returnsVoid: true
				argumentInfo:
				privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
				dispatchPtr: { reg: '$sgpr4_sgpr5' }
				queuePtr: { reg: '$sgpr6_sgpr7' }
				dispatchID: { reg: '$sgpr10_sgpr11' }
				workGroupIDX: { reg: '$sgpr12' }
				workGroupIDY: { reg: '$sgpr13' }
				workGroupIDZ: { reg: '$sgpr14' }
				LDSKernelId: { reg: '$sgpr15' }
				implicitArgPtr: { reg: '$sgpr8_sgpr9' }
				workItemIDX: { reg: '$vgpr31', mask: 1023 }
				workItemIDY: { reg: '$vgpr31', mask: 1047552 }
				workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
				mode:
				ieee: true
				dx10-clamp: true
				fp32-input-denormals: true
				fp32-output-denormals: true
				fp64-fp16-input-denormals: true
				fp64-fp16-output-denormals: true
				highBitsOf32BitAddress: 0
				occupancy: 10
				vgprForAGPRCopy: ''
				body: \|
				bb.0:
				liveins: $vgpr0, $sgpr0_sgpr1

				%1:sgpr_64 = COPY $sgpr0_sgpr1
				%0:vgpr_32 = COPY $vgpr0
				%2:sreg_32_xm0 = S_MOV_B32 255
				%3:vgpr_32 = V_AND_B32_e32 %2, %0, implicit $exec
				%4:vreg_64 = COPY $sgpr0_sgpr1
				%5:vgpr_32 = V_ADD_CO_U32_sdwa 0, %4.sub0, 0, %0, 0, 6, 0, 6, 0, implicit-def $vcc, implicit $exec
				%7:vgpr_32, %8:sreg_64_xexec = V_ADDC_U32_e64 %4.sub1, %0, $vcc, 0, implicit $exec
				%9:vreg_64 = REG_SEQUENCE %5, %subreg.sub0, %7, %subreg.sub1
				GLOBAL_STORE_DWORDX2_SADDR %4.sub0, %9, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))

				...
				---
				name: test6_add_co_sdwa
				alignment: 1
				exposesReturnsTwice: false
				legalized: false
				regBankSelected: false
				selected: false
				failedISel: false
				tracksRegLiveness: true
				hasWinCFI: false
				callsEHReturn: false
				callsUnwindInit: false
				hasEHCatchret: false
				hasEHScopes: false
				hasEHFunclets: false
				failsVerification: false
				tracksDebugUserValues: false
				registers:
				- { id: 0, class: vgpr_32, preferred-register: '' }
				- { id: 1, class: sgpr_64, preferred-register: '' }
				- { id: 2, class: sreg_32_xm0, preferred-register: '' }
				- { id: 3, class: vgpr_32, preferred-register: '' }
				- { id: 4, class: vreg_64, preferred-register: '' }
				- { id: 5, class: vgpr_32, preferred-register: '' }
				- { id: 6, class: sreg_64_xexec, preferred-register: '' }
				- { id: 7, class: vreg_64, preferred-register: '' }
				liveins:
				- { reg: '$vgpr0', virtual-reg: '%0' }
				- { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
				frameInfo:
				isFrameAddressTaken: false
				isReturnAddressTaken: false
				hasStackMap: false
				hasPatchPoint: false
				stackSize: 0
				offsetAdjustment: 0
				maxAlignment: 1
				adjustsStack: false
				hasCalls: false
				stackProtector: ''
				functionContext: ''
				maxCallFrameSize: 4294967295
				cvBytesOfCalleeSavedRegisters: 0
				hasOpaqueSPAdjustment: false
				hasVAStart: false
				hasMustTailInVarArgFunc: false
				hasTailCall: false
				localFrameSize: 0
				savePoint: ''
				restorePoint: ''
				fixedStack: []
				stack: []
				callSites: []
				debugValueSubstitutions: []
				constants: []
				machineFunctionInfo:
				explicitKernArgSize: 0
				maxKernArgAlign: 1
				ldsSize: 0
				gdsSize: 0
				dynLDSAlign: 1
				isEntryFunction: false
				noSignedZerosFPMath: false
				memoryBound: false
				waveLimiter: false
				hasSpilledSGPRs: false
				hasSpilledVGPRs: false
				scratchRSrcReg: '$private_rsrc_reg'
				frameOffsetReg: '$fp_reg'
				stackPtrOffsetReg: '$sp_reg'
				bytesInStackArgArea: 0
				returnsVoid: true
				argumentInfo:
				privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
				dispatchPtr: { reg: '$sgpr4_sgpr5' }
				queuePtr: { reg: '$sgpr6_sgpr7' }
				dispatchID: { reg: '$sgpr10_sgpr11' }
				workGroupIDX: { reg: '$sgpr12' }
				workGroupIDY: { reg: '$sgpr13' }
				workGroupIDZ: { reg: '$sgpr14' }
				LDSKernelId: { reg: '$sgpr15' }
				implicitArgPtr: { reg: '$sgpr8_sgpr9' }
				workItemIDX: { reg: '$vgpr31', mask: 1023 }
				workItemIDY: { reg: '$vgpr31', mask: 1047552 }
				workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
				mode:
				ieee: true
				dx10-clamp: true
				fp32-input-denormals: true
				fp32-output-denormals: true
				fp64-fp16-input-denormals: true
				fp64-fp16-output-denormals: true
				highBitsOf32BitAddress: 0
				occupancy: 10
				vgprForAGPRCopy: ''
				body: \|
				bb.0:
				liveins: $vgpr0, $sgpr0_sgpr1

				%1:sgpr_64 = COPY $sgpr0_sgpr1
				%0:vgpr_32 = COPY $vgpr0
				%2:sreg_32_xm0 = S_MOV_B32 255
				%3:vgpr_32 = V_AND_B32_e32 %2, %0, implicit $exec
				%4:vreg_64 = COPY $sgpr0_sgpr1
				%5:vgpr_32, %6:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %3, 0, implicit $exec
				%7:vreg_64 = REG_SEQUENCE %5, %subreg.sub0, %3, %subreg.sub1
				GLOBAL_STORE_DWORDX2_SADDR %4.sub0, %7, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))

				...
				---
				name: test7_add_co_sdwa
				alignment: 1
				exposesReturnsTwice: false
				legalized: false
				regBankSelected: false
				selected: false
				failedISel: false
				tracksRegLiveness: true
				hasWinCFI: false
				callsEHReturn: false
				callsUnwindInit: false
				hasEHCatchret: false
				hasEHScopes: false
				hasEHFunclets: false
				failsVerification: false
				tracksDebugUserValues: false
				registers:
				- { id: 0, class: vgpr_32, preferred-register: '' }
				- { id: 1, class: sgpr_64, preferred-register: '' }
				- { id: 2, class: sreg_32_xm0, preferred-register: '' }
				- { id: 3, class: vgpr_32, preferred-register: '' }
				- { id: 4, class: sreg_64_xexec, preferred-register: '' }
				- { id: 5, class: vreg_64, preferred-register: '' }
				- { id: 6, class: vgpr_32, preferred-register: '' }
				- { id: 7, class: sreg_64_xexec, preferred-register: '' }
				- { id: 8, class: vreg_64, preferred-register: '' }
				liveins:
				- { reg: '$vgpr0', virtual-reg: '%0' }
				- { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
				frameInfo:
				isFrameAddressTaken: false
				isReturnAddressTaken: false
				hasStackMap: false
				hasPatchPoint: false
				stackSize: 0
				offsetAdjustment: 0
				maxAlignment: 1
				adjustsStack: false
				hasCalls: false
				stackProtector: ''
				functionContext: ''
				maxCallFrameSize: 4294967295
				cvBytesOfCalleeSavedRegisters: 0
				hasOpaqueSPAdjustment: false
				hasVAStart: false
				hasMustTailInVarArgFunc: false
				hasTailCall: false
				localFrameSize: 0
				savePoint: ''
				restorePoint: ''
				fixedStack: []
				stack: []
				callSites: []
				debugValueSubstitutions: []
				constants: []
				machineFunctionInfo:
				explicitKernArgSize: 0
				maxKernArgAlign: 1
				ldsSize: 0
				gdsSize: 0
				dynLDSAlign: 1
				isEntryFunction: false
				noSignedZerosFPMath: false
				memoryBound: false
				waveLimiter: false
				hasSpilledSGPRs: false
				hasSpilledVGPRs: false
				scratchRSrcReg: '$private_rsrc_reg'
				frameOffsetReg: '$fp_reg'
				stackPtrOffsetReg: '$sp_reg'
				bytesInStackArgArea: 0
				returnsVoid: true
				argumentInfo:
				privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
				dispatchPtr: { reg: '$sgpr4_sgpr5' }
				queuePtr: { reg: '$sgpr6_sgpr7' }
				dispatchID: { reg: '$sgpr10_sgpr11' }
				workGroupIDX: { reg: '$sgpr12' }
				workGroupIDY: { reg: '$sgpr13' }
				workGroupIDZ: { reg: '$sgpr14' }
				LDSKernelId: { reg: '$sgpr15' }
				implicitArgPtr: { reg: '$sgpr8_sgpr9' }
				workItemIDX: { reg: '$vgpr31', mask: 1023 }
				workItemIDY: { reg: '$vgpr31', mask: 1047552 }
				workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
				mode:
				ieee: true
				dx10-clamp: true
				fp32-input-denormals: true
				fp32-output-denormals: true
				fp64-fp16-input-denormals: true
				fp64-fp16-output-denormals: true
				highBitsOf32BitAddress: 0
				occupancy: 10
				vgprForAGPRCopy: ''
				body: \|
				bb.0:
				liveins: $vgpr0, $sgpr0_sgpr1

				%1:sgpr_64 = COPY $sgpr0_sgpr1
				%0:vgpr_32 = COPY $vgpr0
				%2:sreg_32_xm0 = S_MOV_B32 255
				%3:vgpr_32 = V_AND_B32_e32 %2, %0, implicit $exec
				%4:sreg_64_xexec = COPY $sgpr0_sgpr1
				%5:vreg_64 = COPY $sgpr0_sgpr1
				%6:vgpr_32, %7:sreg_64_xexec = V_ADDC_U32_e64 %5.sub1, %0, %4, 0, implicit $exec
				%8:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %3, %subreg.sub1
				GLOBAL_STORE_DWORDX2_SADDR %5.sub0, %8, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))

				...
				---
				name: test8_add_co_sdwa
				alignment: 1
				exposesReturnsTwice: false
				legalized: false
				regBankSelected: false
				selected: false
				failedISel: false
				tracksRegLiveness: true
				hasWinCFI: false
				callsEHReturn: false
				callsUnwindInit: false
				hasEHCatchret: false
				hasEHScopes: false
				hasEHFunclets: false
				failsVerification: false
				tracksDebugUserValues: false
				registers:
				- { id: 0, class: vgpr_32, preferred-register: '' }
				- { id: 1, class: sgpr_64, preferred-register: '' }
				- { id: 2, class: sreg_32_xm0, preferred-register: '' }
				- { id: 3, class: vgpr_32, preferred-register: '' }
				- { id: 4, class: vreg_64, preferred-register: '' }
				- { id: 5, class: vgpr_32, preferred-register: '' }
				- { id: 6, class: sreg_64_xexec, preferred-register: '' }
				- { id: 7, class: vgpr_32, preferred-register: '' }
				- { id: 8, class: sreg_64_xexec, preferred-register: '' }
				- { id: 9, class: vreg_64, preferred-register: '' }
				- { id: 10, class: vreg_64, preferred-register: '' }
				liveins:
				- { reg: '$vgpr0', virtual-reg: '%0' }
				- { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
				frameInfo:
				isFrameAddressTaken: false
				isReturnAddressTaken: false
				hasStackMap: false
				hasPatchPoint: false
				stackSize: 0
				offsetAdjustment: 0
				maxAlignment: 1
				adjustsStack: false
				hasCalls: false
				stackProtector: ''
				functionContext: ''
				maxCallFrameSize: 4294967295
				cvBytesOfCalleeSavedRegisters: 0
				hasOpaqueSPAdjustment: false
				hasVAStart: false
				hasMustTailInVarArgFunc: false
				hasTailCall: false
				localFrameSize: 0
				savePoint: ''
				restorePoint: ''
				fixedStack: []
				stack: []
				callSites: []
				debugValueSubstitutions: []
				constants: []
				machineFunctionInfo:
				explicitKernArgSize: 0
				maxKernArgAlign: 1
				ldsSize: 0
				gdsSize: 0
				dynLDSAlign: 1
				isEntryFunction: false
				noSignedZerosFPMath: false
				memoryBound: false
				waveLimiter: false
				hasSpilledSGPRs: false
				hasSpilledVGPRs: false
				scratchRSrcReg: '$private_rsrc_reg'
				frameOffsetReg: '$fp_reg'
				stackPtrOffsetReg: '$sp_reg'
				bytesInStackArgArea: 0
				returnsVoid: true
				argumentInfo:
				privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
				dispatchPtr: { reg: '$sgpr4_sgpr5' }
				queuePtr: { reg: '$sgpr6_sgpr7' }
				dispatchID: { reg: '$sgpr10_sgpr11' }
				workGroupIDX: { reg: '$sgpr12' }
				workGroupIDY: { reg: '$sgpr13' }
				workGroupIDZ: { reg: '$sgpr14' }
				LDSKernelId: { reg: '$sgpr15' }
				implicitArgPtr: { reg: '$sgpr8_sgpr9' }
				workItemIDX: { reg: '$vgpr31', mask: 1023 }
				workItemIDY: { reg: '$vgpr31', mask: 1047552 }
				workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
				mode:
				ieee: true
				dx10-clamp: true
				fp32-input-denormals: true
				fp32-output-denormals: true
				fp64-fp16-input-denormals: true
				fp64-fp16-output-denormals: true
				highBitsOf32BitAddress: 0
				occupancy: 10
				vgprForAGPRCopy: ''
				body: \|
				bb.0:
				liveins: $vgpr0, $sgpr0_sgpr1

				%1:sgpr_64 = COPY $sgpr0_sgpr1
				%0:vgpr_32 = COPY $vgpr0
				%2:sreg_32_xm0 = S_MOV_B32 255
				%3:vgpr_32 = V_AND_B32_e32 %2, %0, implicit $exec
				%4:vreg_64 = COPY $sgpr0_sgpr1
				%5:vgpr_32, %6:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %3, 0, implicit $exec
				$vcc = COPY %4
				%7:vgpr_32, %8:sreg_64_xexec = V_ADDC_U32_e64 %4.sub1, %0, %6, 0, implicit $exec
				%9:vreg_64 = COPY $vcc
				%10:vreg_64 = REG_SEQUENCE %5, %subreg.sub0, %7, %subreg.sub1
				GLOBAL_STORE_DWORDX2_SADDR %9.sub0, %10, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))

				...
				---
				name: test9_add_co_sdwa
				alignment: 1
				exposesReturnsTwice: false
				legalized: false
				regBankSelected: false
				selected: false
				failedISel: false
				tracksRegLiveness: true
				hasWinCFI: false
				callsEHReturn: false
				callsUnwindInit: false
				hasEHCatchret: false
				hasEHScopes: false
				hasEHFunclets: false
				failsVerification: false
				tracksDebugUserValues: false
				registers:
				- { id: 0, class: vgpr_32, preferred-register: '' }
				- { id: 1, class: sgpr_64, preferred-register: '' }
				- { id: 2, class: sreg_32_xm0, preferred-register: '' }
				- { id: 3, class: vreg_64, preferred-register: '' }
				- { id: 4, class: vgpr_32, preferred-register: '' }
				- { id: 5, class: vgpr_32, preferred-register: '' }
				- { id: 6, class: sreg_64_xexec, preferred-register: '' }
				- { id: 7, class: vgpr_32, preferred-register: '' }
				- { id: 8, class: sreg_64_xexec, preferred-register: '' }
				- { id: 9, class: vreg_64, preferred-register: '' }
				- { id: 10, class: vreg_64, preferred-register: '' }
				liveins:
				- { reg: '$vgpr0', virtual-reg: '%0' }
				- { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
				frameInfo:
				isFrameAddressTaken: false
				isReturnAddressTaken: false
				hasStackMap: false
				hasPatchPoint: false
				stackSize: 0
				offsetAdjustment: 0
				maxAlignment: 1
				adjustsStack: false
				hasCalls: false
				stackProtector: ''
				functionContext: ''
				maxCallFrameSize: 4294967295
				cvBytesOfCalleeSavedRegisters: 0
				hasOpaqueSPAdjustment: false
				hasVAStart: false
				hasMustTailInVarArgFunc: false
				hasTailCall: false
				localFrameSize: 0
				savePoint: ''
				restorePoint: ''
				fixedStack: []
				stack: []
				callSites: []
				debugValueSubstitutions: []
				constants: []
				machineFunctionInfo:
				explicitKernArgSize: 0
				maxKernArgAlign: 1
				ldsSize: 0
				gdsSize: 0
				dynLDSAlign: 1
				isEntryFunction: false
				noSignedZerosFPMath: false
				memoryBound: false
				waveLimiter: false
				hasSpilledSGPRs: false
				hasSpilledVGPRs: false
				scratchRSrcReg: '$private_rsrc_reg'
				frameOffsetReg: '$fp_reg'
				stackPtrOffsetReg: '$sp_reg'
				bytesInStackArgArea: 0
				returnsVoid: true
				argumentInfo:
				privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
				dispatchPtr: { reg: '$sgpr4_sgpr5' }
				queuePtr: { reg: '$sgpr6_sgpr7' }
				dispatchID: { reg: '$sgpr10_sgpr11' }
				workGroupIDX: { reg: '$sgpr12' }
				workGroupIDY: { reg: '$sgpr13' }
				workGroupIDZ: { reg: '$sgpr14' }
				LDSKernelId: { reg: '$sgpr15' }
				implicitArgPtr: { reg: '$sgpr8_sgpr9' }
				workItemIDX: { reg: '$vgpr31', mask: 1023 }
				workItemIDY: { reg: '$vgpr31', mask: 1047552 }
				workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
				mode:
				ieee: true
				dx10-clamp: true
				fp32-input-denormals: true
				fp32-output-denormals: true
				fp64-fp16-input-denormals: true
				fp64-fp16-output-denormals: true
				highBitsOf32BitAddress: 0
				occupancy: 10
				vgprForAGPRCopy: ''
				body: \|
				bb.0:
				liveins: $vgpr0, $sgpr0_sgpr1

				%1:sgpr_64 = COPY $sgpr0_sgpr1
				%0:vgpr_32 = COPY $vgpr0
				%2:sreg_32_xm0 = S_MOV_B32 255
				%3:vreg_64 = COPY $sgpr0_sgpr1
				$vcc = COPY %3
				%4:vgpr_32 = V_AND_B32_e32 %2, %0, implicit $exec
				%5:vgpr_32, %6:sreg_64_xexec = V_ADD_CO_U32_e64 %3.sub0, %4, 0, implicit $exec
				%7:vgpr_32, %8:sreg_64_xexec = V_ADDC_U32_e64 %3.sub1, %0, %6, 0, implicit $exec
				%9:vreg_64 = COPY $vcc
				%10:vreg_64 = REG_SEQUENCE %5, %subreg.sub0, %7, %subreg.sub1
				GLOBAL_STORE_DWORDX2_SADDR %9.sub0, %10, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))

				...
				---
				name: test10_add_co_sdwa
				alignment: 1
				exposesReturnsTwice: false
				legalized: false
				regBankSelected: false
				selected: false
				failedISel: false
				tracksRegLiveness: true
				hasWinCFI: false
				callsEHReturn: false
				callsUnwindInit: false
				hasEHCatchret: false
				hasEHScopes: false
				hasEHFunclets: false
				failsVerification: false
				tracksDebugUserValues: false
				registers:
				- { id: 0, class: vgpr_32, preferred-register: '' }
				- { id: 1, class: sgpr_64, preferred-register: '' }
				- { id: 2, class: sreg_32_xm0, preferred-register: '' }
				- { id: 3, class: vreg_64, preferred-register: '' }
				- { id: 4, class: vgpr_32, preferred-register: '' }
				- { id: 5, class: vgpr_32, preferred-register: '' }
				- { id: 6, class: sreg_64_xexec, preferred-register: '' }
				- { id: 7, class: vgpr_32, preferred-register: '' }
				- { id: 8, class: vreg_64, preferred-register: '' }
				- { id: 9, class: vgpr_32, preferred-register: '' }
				- { id: 10, class: sreg_64_xexec, preferred-register: '' }
				- { id: 11, class: vreg_64, preferred-register: '' }
				liveins:
				- { reg: '$vgpr0', virtual-reg: '%0' }
				- { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
				frameInfo:
				isFrameAddressTaken: false
				isReturnAddressTaken: false
				hasStackMap: false
				hasPatchPoint: false
				stackSize: 0
				offsetAdjustment: 0
				maxAlignment: 1
				adjustsStack: false
				hasCalls: false
				stackProtector: ''
				functionContext: ''
				maxCallFrameSize: 4294967295
				cvBytesOfCalleeSavedRegisters: 0
				hasOpaqueSPAdjustment: false
				hasVAStart: false
				hasMustTailInVarArgFunc: false
				hasTailCall: false
				localFrameSize: 0
				savePoint: ''
				restorePoint: ''
				fixedStack: []
				stack: []
				callSites: []
				debugValueSubstitutions: []
				constants: []
				machineFunctionInfo:
				explicitKernArgSize: 0
				maxKernArgAlign: 1
				ldsSize: 0
				gdsSize: 0
				dynLDSAlign: 1
				isEntryFunction: false
				noSignedZerosFPMath: false
				memoryBound: false
				waveLimiter: false
				hasSpilledSGPRs: false
				hasSpilledVGPRs: false
				scratchRSrcReg: '$private_rsrc_reg'
				frameOffsetReg: '$fp_reg'
				stackPtrOffsetReg: '$sp_reg'
				bytesInStackArgArea: 0
				returnsVoid: true
				argumentInfo:
				privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
				dispatchPtr: { reg: '$sgpr4_sgpr5' }
				queuePtr: { reg: '$sgpr6_sgpr7' }
				dispatchID: { reg: '$sgpr10_sgpr11' }
				workGroupIDX: { reg: '$sgpr12' }
				workGroupIDY: { reg: '$sgpr13' }
				workGroupIDZ: { reg: '$sgpr14' }
				LDSKernelId: { reg: '$sgpr15' }
				implicitArgPtr: { reg: '$sgpr8_sgpr9' }
				workItemIDX: { reg: '$vgpr31', mask: 1023 }
				workItemIDY: { reg: '$vgpr31', mask: 1047552 }
				workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
				mode:
				ieee: true
				dx10-clamp: true
				fp32-input-denormals: true
				fp32-output-denormals: true
				fp64-fp16-input-denormals: true
				fp64-fp16-output-denormals: true
				highBitsOf32BitAddress: 0
				occupancy: 10
				vgprForAGPRCopy: ''
				body: \|
				bb.0:
				liveins: $vgpr0, $sgpr0_sgpr1

				%1:sgpr_64 = COPY $sgpr0_sgpr1
				%0:vgpr_32 = COPY $vgpr0
				%2:sreg_32_xm0 = S_MOV_B32 255
				%3:vreg_64 = COPY $sgpr0_sgpr1
				$vcc_lo = COPY %3.sub0
				%4:vgpr_32 = V_AND_B32_e32 %2, %0, implicit $exec
				%5:vgpr_32, %6:sreg_64_xexec = V_ADD_CO_U32_e64 %3.sub0, %4, 0, implicit $exec
				%7:vgpr_32 = COPY $vcc_lo
				%8:vreg_64 = REG_SEQUENCE %7, %subreg.sub0, %4, %subreg.sub1
				%9:vgpr_32, %10:sreg_64_xexec = V_ADDC_U32_e64 %3.sub1, %0, %6, 0, implicit $exec
				%11:vreg_64 = REG_SEQUENCE %5, %subreg.sub0, %9, %subreg.sub1
				GLOBAL_STORE_DWORDX2_SADDR %8.sub0, %11, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))

				...
				---
				# Machine function test11_add_co_sdwa: a single block (bb.0) with live-ins
				# $vgpr0 and $sgpr0_sgpr1.
				#
				# The body masks %0 with 255 (S_MOV_B32 255 + V_AND_B32_e32), adds the
				# masked value to %3.sub0 with V_ADD_CO_U32_e64 (carry-out in %6), and
				# feeds %6 into V_ADDC_U32_e64, which adds %3.sub1 and %0.
				#
				# $vcc_hi is written from %3.sub0 before the V_AND_B32_e32 and copied into
				# %7 after the V_ADD_CO_U32_e64, so $vcc_hi is live across that add.
				# The body matches the preceding function's body except that it uses
				# $vcc_hi where that one uses $vcc_lo.
				#
				# NOTE(review): presumably this exercises the SIPeepholeSDWA rewrite of
				# V_ADD_CO_U32_e64 into V_ADD_CO_U32_sdwa (which carries an implicit-def
				# of $vcc) while part of VCC is live; the expected output is not visible
				# in this chunk -- confirm against the CHECK lines.
				name: test11_add_co_sdwa
				alignment: 1
				exposesReturnsTwice: false
				legalized: false
				regBankSelected: false
				selected: false
				failedISel: false
				tracksRegLiveness: true
				hasWinCFI: false
				callsEHReturn: false
				callsUnwindInit: false
				hasEHCatchret: false
				hasEHScopes: false
				hasEHFunclets: false
				failsVerification: false
				tracksDebugUserValues: false
				registers:
				- { id: 0, class: vgpr_32, preferred-register: '' }
				- { id: 1, class: sgpr_64, preferred-register: '' }
				- { id: 2, class: sreg_32_xm0, preferred-register: '' }
				- { id: 3, class: vreg_64, preferred-register: '' }
				- { id: 4, class: vgpr_32, preferred-register: '' }
				- { id: 5, class: vgpr_32, preferred-register: '' }
				- { id: 6, class: sreg_64_xexec, preferred-register: '' }
				- { id: 7, class: vgpr_32, preferred-register: '' }
				- { id: 8, class: vreg_64, preferred-register: '' }
				- { id: 9, class: vgpr_32, preferred-register: '' }
				- { id: 10, class: sreg_64_xexec, preferred-register: '' }
				- { id: 11, class: vreg_64, preferred-register: '' }
				liveins:
				- { reg: '$vgpr0', virtual-reg: '%0' }
				- { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
				frameInfo:
				isFrameAddressTaken: false
				isReturnAddressTaken: false
				hasStackMap: false
				hasPatchPoint: false
				stackSize: 0
				offsetAdjustment: 0
				maxAlignment: 1
				adjustsStack: false
				hasCalls: false
				stackProtector: ''
				functionContext: ''
				maxCallFrameSize: 4294967295
				cvBytesOfCalleeSavedRegisters: 0
				hasOpaqueSPAdjustment: false
				hasVAStart: false
				hasMustTailInVarArgFunc: false
				hasTailCall: false
				localFrameSize: 0
				savePoint: ''
				restorePoint: ''
				fixedStack: []
				stack: []
				callSites: []
				debugValueSubstitutions: []
				constants: []
				machineFunctionInfo:
				explicitKernArgSize: 0
				maxKernArgAlign: 1
				ldsSize: 0
				gdsSize: 0
				dynLDSAlign: 1
				isEntryFunction: false
				noSignedZerosFPMath: false
				memoryBound: false
				waveLimiter: false
				hasSpilledSGPRs: false
				hasSpilledVGPRs: false
				scratchRSrcReg: '$private_rsrc_reg'
				frameOffsetReg: '$fp_reg'
				stackPtrOffsetReg: '$sp_reg'
				bytesInStackArgArea: 0
				returnsVoid: true
				argumentInfo:
				privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
				dispatchPtr: { reg: '$sgpr4_sgpr5' }
				queuePtr: { reg: '$sgpr6_sgpr7' }
				dispatchID: { reg: '$sgpr10_sgpr11' }
				workGroupIDX: { reg: '$sgpr12' }
				workGroupIDY: { reg: '$sgpr13' }
				workGroupIDZ: { reg: '$sgpr14' }
				LDSKernelId: { reg: '$sgpr15' }
				implicitArgPtr: { reg: '$sgpr8_sgpr9' }
				workItemIDX: { reg: '$vgpr31', mask: 1023 }
				workItemIDY: { reg: '$vgpr31', mask: 1047552 }
				workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
				mode:
				ieee: true
				dx10-clamp: true
				fp32-input-denormals: true
				fp32-output-denormals: true
				fp64-fp16-input-denormals: true
				fp64-fp16-output-denormals: true
				highBitsOf32BitAddress: 0
				occupancy: 10
				vgprForAGPRCopy: ''
				body: \|
				bb.0:
				liveins: $vgpr0, $sgpr0_sgpr1

				%1:sgpr_64 = COPY $sgpr0_sgpr1
				%0:vgpr_32 = COPY $vgpr0
				%2:sreg_32_xm0 = S_MOV_B32 255
				%3:vreg_64 = COPY $sgpr0_sgpr1
				$vcc_hi = COPY %3.sub0
				%4:vgpr_32 = V_AND_B32_e32 %2, %0, implicit $exec
				%5:vgpr_32, %6:sreg_64_xexec = V_ADD_CO_U32_e64 %3.sub0, %4, 0, implicit $exec
				%7:vgpr_32 = COPY $vcc_hi
				%8:vreg_64 = REG_SEQUENCE %7, %subreg.sub0, %4, %subreg.sub1
				%9:vgpr_32, %10:sreg_64_xexec = V_ADDC_U32_e64 %3.sub1, %0, %6, 0, implicit $exec
				%11:vreg_64 = REG_SEQUENCE %5, %subreg.sub0, %9, %subreg.sub1
				GLOBAL_STORE_DWORDX2_SADDR %8.sub0, %11, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))

				...
				---
				# Machine function test12_add_co_sdwa: a single block (bb.0) with live-ins
				# $vgpr0 and $sgpr0_sgpr1.
				#
				# The body masks %0 with 255 (S_MOV_B32 255 + V_AND_B32_e32), adds the
				# masked value to %3.sub0 with V_ADD_CO_U32_e64 (carry-out in %6), and
				# later feeds %6 into V_ADDC_U32_e64, which adds %3.sub1 and %0.
				#
				# Between those two adds the full $vcc is written ("$vcc = COPY %3") and
				# immediately consumed ("COPY killed $vcc" into %7), so $vcc is defined
				# and killed after the carry producer and before the carry consumer.
				# %7.sub0 and the REG_SEQUENCE of both add results (%10) are then used as
				# operands of the final GLOBAL_STORE_DWORDX2_SADDR.
				#
				# NOTE(review): presumably this exercises the SIPeepholeSDWA rewrite of
				# V_ADD_CO_U32_e64 into V_ADD_CO_U32_sdwa (which carries an implicit-def
				# of $vcc) when $vcc is redefined between the add and the add-with-carry;
				# the expected output is not visible in this chunk -- confirm against
				# the CHECK lines.
				name: test12_add_co_sdwa
				alignment: 1
				exposesReturnsTwice: false
				legalized: false
				regBankSelected: false
				selected: false
				failedISel: false
				tracksRegLiveness: true
				hasWinCFI: false
				callsEHReturn: false
				callsUnwindInit: false
				hasEHCatchret: false
				hasEHScopes: false
				hasEHFunclets: false
				failsVerification: false
				tracksDebugUserValues: false
				registers:
				- { id: 0, class: vgpr_32, preferred-register: '' }
				- { id: 1, class: sgpr_64, preferred-register: '' }
				- { id: 2, class: sreg_32_xm0, preferred-register: '' }
				- { id: 3, class: vreg_64, preferred-register: '' }
				- { id: 4, class: vgpr_32, preferred-register: '' }
				- { id: 5, class: vgpr_32, preferred-register: '' }
				- { id: 6, class: sreg_64_xexec, preferred-register: '' }
				- { id: 7, class: vreg_64, preferred-register: '' }
				- { id: 8, class: vgpr_32, preferred-register: '' }
				- { id: 9, class: sreg_64_xexec, preferred-register: '' }
				- { id: 10, class: vreg_64, preferred-register: '' }
				liveins:
				- { reg: '$vgpr0', virtual-reg: '%0' }
				- { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
				frameInfo:
				isFrameAddressTaken: false
				isReturnAddressTaken: false
				hasStackMap: false
				hasPatchPoint: false
				stackSize: 0
				offsetAdjustment: 0
				maxAlignment: 1
				adjustsStack: false
				hasCalls: false
				stackProtector: ''
				functionContext: ''
				maxCallFrameSize: 4294967295
				cvBytesOfCalleeSavedRegisters: 0
				hasOpaqueSPAdjustment: false
				hasVAStart: false
				hasMustTailInVarArgFunc: false
				hasTailCall: false
				localFrameSize: 0
				savePoint: ''
				restorePoint: ''
				fixedStack: []
				stack: []
				callSites: []
				debugValueSubstitutions: []
				constants: []
				machineFunctionInfo:
				explicitKernArgSize: 0
				maxKernArgAlign: 1
				ldsSize: 0
				gdsSize: 0
				dynLDSAlign: 1
				isEntryFunction: false
				noSignedZerosFPMath: false
				memoryBound: false
				waveLimiter: false
				hasSpilledSGPRs: false
				hasSpilledVGPRs: false
				scratchRSrcReg: '$private_rsrc_reg'
				frameOffsetReg: '$fp_reg'
				stackPtrOffsetReg: '$sp_reg'
				bytesInStackArgArea: 0
				returnsVoid: true
				argumentInfo:
				privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
				dispatchPtr: { reg: '$sgpr4_sgpr5' }
				queuePtr: { reg: '$sgpr6_sgpr7' }
				dispatchID: { reg: '$sgpr10_sgpr11' }
				workGroupIDX: { reg: '$sgpr12' }
				workGroupIDY: { reg: '$sgpr13' }
				workGroupIDZ: { reg: '$sgpr14' }
				LDSKernelId: { reg: '$sgpr15' }
				implicitArgPtr: { reg: '$sgpr8_sgpr9' }
				workItemIDX: { reg: '$vgpr31', mask: 1023 }
				workItemIDY: { reg: '$vgpr31', mask: 1047552 }
				workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
				mode:
				ieee: true
				dx10-clamp: true
				fp32-input-denormals: true
				fp32-output-denormals: true
				fp64-fp16-input-denormals: true
				fp64-fp16-output-denormals: true
				highBitsOf32BitAddress: 0
				occupancy: 10
				vgprForAGPRCopy: ''
				body: \|
				bb.0:
				liveins: $vgpr0, $sgpr0_sgpr1

				%1:sgpr_64 = COPY $sgpr0_sgpr1
				%0:vgpr_32 = COPY $vgpr0
				%2:sreg_32_xm0 = S_MOV_B32 255
				%3:vreg_64 = COPY $sgpr0_sgpr1
				%4:vgpr_32 = V_AND_B32_e32 %2, %0, implicit $exec
				%5:vgpr_32, %6:sreg_64_xexec = V_ADD_CO_U32_e64 %3.sub0, %4, 0, implicit $exec
				$vcc = COPY %3
				%7:vreg_64 = COPY killed $vcc
				%8:vgpr_32, %9:sreg_64_xexec = V_ADDC_U32_e64 %3.sub1, %0, %6, 0, implicit $exec
				%10:vreg_64 = REG_SEQUENCE %5, %subreg.sub0, %8, %subreg.sub1
				GLOBAL_STORE_DWORDX2_SADDR %7.sub0, %10, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))

				...

This is an archive of the discontinued LLVM Phabricator instance.

Handling ADD|SUB U64 decomposed Pseudos not getting lowered to SDWA form
ClosedPublic

Details

Diff Detail

Unit Tests: Failed

Event Timeline

Revision Contents

Diff 474487

llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

llvm/test/CodeGen/AMDGPU/sdwa-ops.mir

llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll

llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll

llvm/tmp.mir

This is an archive of the discontinued LLVM Phabricator instance.

Handling ADD|SUB U64 decomposed Pseudos not getting lowered to SDWA form
ClosedPublic

Details

Diff Detail

Unit Tests: Failed

Event Timeline

Revision Contents

Diff 474487

llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

llvm/test/CodeGen/AMDGPU/sdwa-ops.mir

llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll

llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll

llvm/tmp.mir

Handling ADD|SUB U64 decomposed Pseudos not getting lowered to SDWA form
ClosedPublic