This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Fold 64 bit immediates
AbandonedPublic

Authored by rampitec on Dec 11 2019, 10:41 AM.

Download Raw Diff

Details

Reviewers

arsenm
kzhuravl

Summary

64 bit operand folding did not work because these constants
are usually represented as a reg_sequence. Also legality checks
were partially missing and partially too restrictive. We can use
a 64 bit immediate if it can be represented by a 32 bit sign
extended integer.

Diff Detail

Event Timeline

rampitec created this revision. · Dec 11 2019, 10:41 AM

Herald added a project: Restricted Project. · View Herald Transcript · Dec 11 2019, 10:41 AM

Herald added subscribers: hiraditya, t-tye, tpr and 5 others. · View Herald Transcript

arsenm added inline comments. · Dec 11 2019, 8:50 PM

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
2954–2958	I think this is more complicated and depends on the context instruction, which is why this was never done. I think some instructions zero-extend the 32-bit constants (including FP), and then maybe some sign extend

I would also expect the immediate selection to understand the expanded set of immediates before the folding

rampitec marked an inline comment as done. · Dec 11 2019, 9:25 PM

rampitec added inline comments.

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
2954–2958	Do you have an example? As far as I understand HW logic is quite primitive and doesn't distinguish, so it is always sign extended. At least this has passed PSDB and was used specifically in the fp context.

In D71367#1780848, @arsenm wrote:

I would also expect the immediate selection to understand the expanded set of immediates before the folding

That's a separate optimization. I can also see how this folding can be converted into an s_mov_b64 in a constant limited case, but it is again a separate optimization.

rampitec abandoned this revision. · Dec 12 2019, 11:13 AM

arsenm added inline comments. · Dec 21 2019, 2:27 AM

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
2954–2958	You would have to craft new execution tests for FP constants. It's unlikely anything is really testing FP constants that would stress this

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

MCTargetDesc/

AMDGPUInstPrinter.cpp

2 lines

SIFoldOperands.cpp

33 lines

SIInstrInfo.cpp

9 lines

test/

CodeGen/

AMDGPU/

cross-block-use-is-not-abi-copy.ll

12 lines

inline-constraints.ll

120 lines

shl.ll

9 lines

Diff 233411

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp

Show First 20 Lines • Show All 443 Lines • ▼ Show 20 Lines	void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
else if (Imm == DoubleToBits(4.0))		else if (Imm == DoubleToBits(4.0))
O << "4.0";		O << "4.0";
else if (Imm == DoubleToBits(-4.0))		else if (Imm == DoubleToBits(-4.0))
O << "-4.0";		O << "-4.0";
else if (Imm == 0x3fc45f306dc9c882 &&		else if (Imm == 0x3fc45f306dc9c882 &&
STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])		STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
O << "0.15915494309189532";		O << "0.15915494309189532";
else {		else {
assert(isUInt<32>(Imm) \|\| Imm == 0x3fc45f306dc9c882);		assert(isInt<32>(Imm) \|\| isUInt<32>(Imm) \|\| Imm == 0x3fc45f306dc9c882);

// In rare situations, we will have a 32-bit literal in a 64-bit		// In rare situations, we will have a 32-bit literal in a 64-bit
// operand. This is technically allowed for the encoding of s_mov_b64.		// operand. This is technically allowed for the encoding of s_mov_b64.
O << formatHex(static_cast<uint64_t>(Imm));		O << formatHex(static_cast<uint64_t>(Imm));
}		}
}		}

void AMDGPUInstPrinter::printBLGP(const MCInst *MI, unsigned OpNo,		void AMDGPUInstPrinter::printBLGP(const MCInst *MI, unsigned OpNo,
▲ Show 20 Lines • Show All 1,087 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Show First 20 Lines • Show All 99 Lines • ▼ Show 20 Lines	public:

void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;		void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;

const MachineOperand *isClamp(const MachineInstr &MI) const;		const MachineOperand *isClamp(const MachineInstr &MI) const;
bool tryFoldClamp(MachineInstr &MI);		bool tryFoldClamp(MachineInstr &MI);

std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;		std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
bool tryFoldOMod(MachineInstr &MI);		bool tryFoldOMod(MachineInstr &MI);
		bool tryFoldRegSeqence(MachineInstr &MI);

public:		public:
SIFoldOperands() : MachineFunctionPass(ID) {		SIFoldOperands() : MachineFunctionPass(ID) {
initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());		initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
}		}

bool runOnMachineFunction(MachineFunction &MF) override;		bool runOnMachineFunction(MachineFunction &MF) override;

▲ Show 20 Lines • Show All 1,342 Lines • ▼ Show 20 Lines	bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');		LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');

DefOMod->setImm(OMod);		DefOMod->setImm(OMod);
MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());		MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
MI.eraseFromParent();		MI.eraseFromParent();
return true;		return true;
}		}

		// Try to fold 64-bit immediate reg_sequence into uses.
		bool SIFoldOperands::tryFoldRegSeqence(MachineInstr &MI) {
		assert(MI.isRegSequence());
		auto Reg = MI.getOperand(0).getReg();
		SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;

		if (Reg.isPhysical())
		return false;

		if (TII->getOpSize(MI, 0) != 8 \|\|
		!getRegSeqInit(Defs, Reg, AMDGPU::OPERAND_REG_IMM_INT32, TII, *MRI))
		return false;

		assert(Defs.size() == 2);
		if (!Defs[0].first->isImm() \|\| !Defs[1].first->isImm())
		return false;

		uint64_t Lit = ((Defs[0].first->getImm() & 0xffffffff) <<
		(Defs[0].second == AMDGPU::sub0 ? 0 : 32)) \|
		((Defs[1].first->getImm() & 0xffffffff) <<
		(Defs[1].second == AMDGPU::sub0 ? 0 : 32));

		MI.addOperand(MachineOperand::CreateImm(Lit));
		foldInstOperand(MI, MI.getOperand(MI.getNumOperands() - 1));
		MI.RemoveOperand(MI.getNumOperands() - 1);

		return true;
		}

bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {		bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))		if (skipFunction(MF.getFunction()))
return false;		return false;

MRI = &MF.getRegInfo();		MRI = &MF.getRegInfo();
ST = &MF.getSubtarget<GCNSubtarget>();		ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();		TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();		TRI = &TII->getRegisterInfo();
Show All 11 Lines	for (MachineBasicBlock *MBB : depth_first(&MF)) {

MachineOperand *CurrentKnownM0Val = nullptr;		MachineOperand *CurrentKnownM0Val = nullptr;
for (I = MBB->begin(); I != MBB->end(); I = Next) {		for (I = MBB->begin(); I != MBB->end(); I = Next) {
Next = std::next(I);		Next = std::next(I);
MachineInstr &MI = *I;		MachineInstr &MI = *I;

tryFoldInst(TII, &MI);		tryFoldInst(TII, &MI);

		if (MI.isRegSequence() && tryFoldRegSeqence(MI))
		continue;

if (!TII->isFoldableCopy(MI)) {		if (!TII->isFoldableCopy(MI)) {
// Saw an unknown clobber of m0, so we no longer know what it is.		// Saw an unknown clobber of m0, so we no longer know what it is.
if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))		if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
CurrentKnownM0Val = nullptr;		CurrentKnownM0Val = nullptr;

// TODO: Omod might be OK if there is NSZ only on the source		// TODO: Omod might be OK if there is NSZ only on the source
// instruction, and not the omod multiply.		// instruction, and not the omod multiply.
if (IsIEEEMode \|\| (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) \|\|		if (IsIEEEMode \|\| (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) \|\|
▲ Show 20 Lines • Show All 47 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Show First 20 Lines • Show All 2,844 Lines • ▼ Show 20 Lines	bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
switch (OperandType) {		switch (OperandType) {
case AMDGPU::OPERAND_REG_IMM_INT32:		case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:		case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:		case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:		case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:		case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {		case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
int32_t Trunc = static_cast<int32_t>(Imm);		int32_t Trunc = static_cast<int32_t>(Imm);
return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());		return (isInt<32>(Imm) \|\| isUInt<32>(Imm)) &&
		AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
}		}
case AMDGPU::OPERAND_REG_IMM_INT64:		case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:		case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:		case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:		case AMDGPU::OPERAND_REG_INLINE_C_FP64:
return AMDGPU::isInlinableLiteral64(MO.getImm(),		return AMDGPU::isInlinableLiteral64(MO.getImm(),
ST.hasInv2PiInlineImm());		ST.hasInv2PiInlineImm());
case AMDGPU::OPERAND_REG_IMM_INT16:		case AMDGPU::OPERAND_REG_IMM_INT16:
▲ Show 20 Lines • Show All 83 Lines • ▼ Show 20 Lines	if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
AMDGPU::OpName::src2))		AMDGPU::OpName::src2))
return false;		return false;
return RI.opCanUseInlineConstant(OpInfo.OperandType);		return RI.opCanUseInlineConstant(OpInfo.OperandType);
}		}

if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))		if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
return false;		return false;

		if (MO.isImm()) {
		int64_t Imm = MO.getImm();
		if (!isInt<32>(Imm) && !isUInt<32>(Imm))
		return false;
		}
		[Inline comment — arsenm; unsubmitted, not done] I think this is more complicated and depends on the context instruction, which is why this was never done. I think some instructions zero-extend the 32-bit constants (including FP), and then maybe some sign extend.
		[Inline comment — rampitec (author); unsubmitted, done] Do you have an example? As far as I understand HW logic is quite primitive and doesn't distinguish, so it is always sign extended. At least this has passed PSDB and was used specifically in the fp context.
		[Inline comment — arsenm; unsubmitted, not done] You would have to craft new execution tests for FP constants. It's unlikely anything is really testing FP constants that would stress this.

if (!isVOP3(MI) \|\| !AMDGPU::isSISrcOperand(InstDesc, OpNo))		if (!isVOP3(MI) \|\| !AMDGPU::isSISrcOperand(InstDesc, OpNo))
return true;		return true;

return ST.hasVOP3Literal();		return ST.hasVOP3Literal();
}		}

bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {		bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
int Op32 = AMDGPU::getVOPe32(Opcode);		int Op32 = AMDGPU::getVOPe32(Opcode);
▲ Show 20 Lines • Show All 3,662 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll

	Show First 20 Lines • Show All 173 Lines • ▼ Show 20 Lines
	; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0			; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
	; GCN-NEXT: s_mov_b32 s32, s33			; GCN-NEXT: s_mov_b32 s32, s33
	; GCN-NEXT: s_waitcnt lgkmcnt(0)			; GCN-NEXT: s_waitcnt lgkmcnt(0)
	; GCN-NEXT: s_and_b32 s4, 1, s4			; GCN-NEXT: s_and_b32 s4, 1, s4
	; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1			; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1
	; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]			; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
	; GCN-NEXT: s_cbranch_vccz BB4_2			; GCN-NEXT: s_cbranch_vccz BB4_2
	; GCN-NEXT: ; %bb.1:			; GCN-NEXT: ; %bb.1:
	; GCN-NEXT: s_mov_b32 s4, 0			; GCN-NEXT: v_mov_b32_e32 v0, 0
	; GCN-NEXT: s_mov_b32 s5, s4			; GCN-NEXT: v_mov_b32_e32 v1, 0
	; GCN-NEXT: v_mov_b32_e32 v0, s4
	; GCN-NEXT: v_mov_b32_e32 v1, s5
	; GCN-NEXT: s_branch BB4_3			; GCN-NEXT: s_branch BB4_3
	; GCN-NEXT: BB4_2: ; %if.else			; GCN-NEXT: BB4_2: ; %if.else
	; GCN-NEXT: s_getpc_b64 s[4:5]			; GCN-NEXT: s_getpc_b64 s[4:5]
	; GCN-NEXT: s_add_u32 s4, s4, func_v3i16@rel32@lo+4			; GCN-NEXT: s_add_u32 s4, s4, func_v3i16@rel32@lo+4
	; GCN-NEXT: s_addc_u32 s5, s5, func_v3i16@rel32@hi+4			; GCN-NEXT: s_addc_u32 s5, s5, func_v3i16@rel32@hi+4
	; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]			; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
	; GCN-NEXT: BB4_3: ; %if.end			; GCN-NEXT: BB4_3: ; %if.end
	; GCN-NEXT: global_store_short v[0:1], v1, off			; GCN-NEXT: global_store_short v[0:1], v1, off
	Show All 24 Lines
	; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0			; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
	; GCN-NEXT: s_mov_b32 s32, s33			; GCN-NEXT: s_mov_b32 s32, s33
	; GCN-NEXT: s_waitcnt lgkmcnt(0)			; GCN-NEXT: s_waitcnt lgkmcnt(0)
	; GCN-NEXT: s_and_b32 s4, 1, s4			; GCN-NEXT: s_and_b32 s4, 1, s4
	; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1			; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1
	; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]			; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
	; GCN-NEXT: s_cbranch_vccz BB5_2			; GCN-NEXT: s_cbranch_vccz BB5_2
	; GCN-NEXT: ; %bb.1:			; GCN-NEXT: ; %bb.1:
	; GCN-NEXT: s_mov_b32 s4, 0			; GCN-NEXT: v_mov_b32_e32 v0, 0
	; GCN-NEXT: s_mov_b32 s5, s4			; GCN-NEXT: v_mov_b32_e32 v1, 0
	; GCN-NEXT: v_mov_b32_e32 v0, s4
	; GCN-NEXT: v_mov_b32_e32 v1, s5
	; GCN-NEXT: s_branch BB5_3			; GCN-NEXT: s_branch BB5_3
	; GCN-NEXT: BB5_2: ; %if.else			; GCN-NEXT: BB5_2: ; %if.else
	; GCN-NEXT: s_getpc_b64 s[4:5]			; GCN-NEXT: s_getpc_b64 s[4:5]
	; GCN-NEXT: s_add_u32 s4, s4, func_v3f16@rel32@lo+4			; GCN-NEXT: s_add_u32 s4, s4, func_v3f16@rel32@lo+4
	; GCN-NEXT: s_addc_u32 s5, s5, func_v3f16@rel32@hi+4			; GCN-NEXT: s_addc_u32 s5, s5, func_v3f16@rel32@hi+4
	; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]			; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
	; GCN-NEXT: BB5_3: ; %if.end			; GCN-NEXT: BB5_3: ; %if.end
	; GCN-NEXT: global_store_short v[0:1], v1, off			; GCN-NEXT: global_store_short v[0:1], v1, off
	Show All 28 Lines

llvm/test/CodeGen/AMDGPU/inline-constraints.ll

	Show First 20 Lines • Show All 52 Lines • ▼ Show 20 Lines
	; GCN: ; use [[REG]]			; GCN: ; use [[REG]]
	define amdgpu_kernel void @inline_sreg_constraint_imm_f32() {			define amdgpu_kernel void @inline_sreg_constraint_imm_f32() {
	tail call void asm sideeffect "; use $0", "s"(float 1.0)			tail call void asm sideeffect "; use $0", "s"(float 1.0)
	ret void			ret void
	}			}

	; FIXME: Should be able to use s_mov_b64			; FIXME: Should be able to use s_mov_b64
	; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64:			; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64:
	; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], -4{{$}}			; GCN-DAG: s_mov_b64 s{{\[}}[[REG_LO:[0-9]+]]:[[REG_HI:[0-9]+]]], -4{{$}}
	; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], -1{{$}}
	; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}			; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
	define amdgpu_kernel void @inline_sreg_constraint_imm_i64() {			define amdgpu_kernel void @inline_sreg_constraint_imm_i64() {
	tail call void asm sideeffect "; use $0", "s"(i64 -4)			tail call void asm sideeffect "; use $0", "s"(i64 -4)
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}inline_sreg_constraint_imm_f64:			; GCN-LABEL: {{^}}inline_sreg_constraint_imm_f64:
	; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0{{$}}			; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0{{$}}
	; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 0x3ff00000{{$}}			; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 0x3ff00000{{$}}
	; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}			; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
	define amdgpu_kernel void @inline_sreg_constraint_imm_f64() {			define amdgpu_kernel void @inline_sreg_constraint_imm_f64() {
	tail call void asm sideeffect "; use $0", "s"(double 1.0)			tail call void asm sideeffect "; use $0", "s"(double 1.0)
	ret void			ret void
	}			}

				; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64_m4:
				; GCN-DAG: s_mov_b64 s{{\[}}[[REG_LO:[0-9]+]]:[[REG_HI:[0-9]+]]], -4{{$}}
				; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
				define amdgpu_kernel void @inline_sreg_constraint_imm_i64_m4() {
				tail call void asm sideeffect "; use $0", "s"(i64 -4)
				ret void
				}

				; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64_4_0:
				; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0{{$}}
				; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 4.0{{$}}
				; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
				define amdgpu_kernel void @inline_sreg_constraint_imm_i64_4_0() {
				tail call void asm sideeffect "; use $0", "s"(i64 4647714815446351872)
				ret void
				}

				; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64_m4_0:
				; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0{{$}}
				; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], -4.0{{$}}
				; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
				define amdgpu_kernel void @inline_sreg_constraint_imm_i64_m4_0() {
				tail call void asm sideeffect "; use $0", "s"(i64 13871086852301127680)
				ret void
				}

				; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64_1:
				; GCN-DAG: s_mov_b64 s{{\[}}[[REG_LO:[0-9]+]]:[[REG_HI:[0-9]+]]], 1{{$}}
				; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
				define amdgpu_kernel void @inline_sreg_constraint_imm_i64_1() {
				tail call void asm sideeffect "; use $0", "s"(i64 1)
				ret void
				}

				; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64_4_m1:
				; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], -1{{$}}
				; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 4.0{{$}}
				; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
				define amdgpu_kernel void @inline_sreg_constraint_imm_i64_4_m1() {
				tail call void asm sideeffect "; use $0", "s"(i64 4647714819741319167)
				ret void
				}

				; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64_m1_4:
				; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 4.0{{$}}
				; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], -1{{$}}
				; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
				define amdgpu_kernel void @inline_sreg_constraint_imm_i64_m1_4() {
				tail call void asm sideeffect "; use $0", "s"(i64 18446744070496714752)
				ret void
				}

				; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64_m1_m4:
				; GCN-DAG: s_mov_b64 s{{\[}}[[REG_LO:[0-9]+]]:[[REG_HI:[0-9]+]]], 0xffffffffc0800000{{$}}
				; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
				define amdgpu_kernel void @inline_sreg_constraint_imm_i64_m1_m4() {
				tail call void asm sideeffect "; use $0", "s"(i64 18446744072644198400)
				ret void
				}

				; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64_1_4:
				; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 4.0{{$}}
				; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 1{{$}}
				; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
				define amdgpu_kernel void @inline_sreg_constraint_imm_i64_1_4() {
				tail call void asm sideeffect "; use $0", "s"(i64 5377097728)
				ret void
				}

				; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64_1_m4:
				; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], -4.0{{$}}
				; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 1{{$}}
				; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
				define amdgpu_kernel void @inline_sreg_constraint_imm_i64_1_m4() {
				tail call void asm sideeffect "; use $0", "s"(i64 7524581376)
				ret void
				}

				; FIXME: Should be able to use s_mov_b64
				; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64_100:
				; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0x42c80000{{$}}
				; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 0{{$}}
				; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
				define amdgpu_kernel void @inline_sreg_constraint_imm_i64_100() {
				tail call void asm sideeffect "; use $0", "s"(i64 1120403456)
				ret void
				}

				; FIXME: Should be able to use s_mov_b64
				; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64_m100:
				; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0xc2c80000{{$}}
				; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 0{{$}}
				; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
				define amdgpu_kernel void @inline_sreg_constraint_imm_i64_m100() {
				tail call void asm sideeffect "; use $0", "s"(i64 3267887104)
				ret void
				}

				; FIXME: Should be able to use s_mov_b64
				; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64_m1_m100:
				; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0xc2c80000{{$}}
				; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], -1{{$}}
				; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
				define amdgpu_kernel void @inline_sreg_constraint_imm_i64_m1_m100() {
				tail call void asm sideeffect "; use $0", "s"(i64 18446744072682471424)
				ret void
				}

				; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64_1_m100:
				; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0xc2c80000{{$}}
				; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 1{{$}}
				; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
				define amdgpu_kernel void @inline_sreg_constraint_imm_i64_1_m100() {
				tail call void asm sideeffect "; use $0", "s"(i64 7562854400)
				ret void
				}

llvm/test/CodeGen/AMDGPU/shl.ll

Show First 20 Lines • Show All 420 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
ret void		ret void
}		}


; Test with the 64-bit integer bitpattern for a 32-bit float in the		; Test with the 64-bit integer bitpattern for a 32-bit float in the
; low 32-bits, which is not a valid 64-bit inline immmediate.		; low 32-bits, which is not a valid 64-bit inline immmediate.

; FUNC-LABEL: {{^}}s_shl_inline_imm_f32_4.0_i64:		; FUNC-LABEL: {{^}}s_shl_inline_imm_f32_4.0_i64:
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0		; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 0x40800000, s{{[0-9]+}}
; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}}
; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}}
define amdgpu_kernel void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {		define amdgpu_kernel void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%shl = shl i64 1082130432, %a		%shl = shl i64 1082130432, %a
store i64 %shl, i64 addrspace(1)* %out, align 8		store i64 %shl, i64 addrspace(1)* %out, align 8
ret void		ret void
}		}

; FIXME: Copy of -1 register		; FIXME: Copy of -1 register
; FUNC-LABEL: {{^}}s_shl_inline_imm_f32_neg_4.0_i64:		; FUNC-LABEL: {{^}}s_shl_inline_imm_f32_neg_4.0_i64:
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], -4.0		; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 0xffffffffc0800000, s{{[0-9]+}}
; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}}
; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]]
; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}}, s{{[0-9]+}}
define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {		define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%shl = shl i64 -1065353216, %a		%shl = shl i64 -1065353216, %a
store i64 %shl, i64 addrspace(1)* %out, align 8		store i64 %shl, i64 addrspace(1)* %out, align 8
ret void		ret void
}		}

; Shift into upper 32-bits		; Shift into upper 32-bits
; FUNC-LABEL: {{^}}s_shl_inline_high_imm_f32_4.0_i64:		; FUNC-LABEL: {{^}}s_shl_inline_high_imm_f32_4.0_i64:
▲ Show 20 Lines • Show All 52 Lines • Show Last 20 Lines