This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Fold FP clamp as modifier bit
ClosedPublic

Authored by arsenm on Feb 17 2017, 11:29 PM.

Download Raw Diff

Details

Reviewers

Summary

The manual is unclear on the details of this. It's not
clear to me if denormals are not allowed with clamp,
or if that is only omod. Not allowing denorms for
fp16 or fp64 isn't useful so I also question if that
is really a restriction. Same with whether this is valid
without IEEE mode enabled.

Diff Detail

Event Timeline

arsenm created this revision.Feb 17 2017, 11:29 PM

Herald added subscribers: tpr, tony-tye, yaxunl and 3 others. · View Herald TranscriptFeb 17 2017, 11:29 PM

arsenm added a parent revision: D11829: AMDGPU/SI: Re-define AMDGPUISD:CLAMP as always clamping between 0.0. and 1.0.Feb 17 2017, 11:30 PM

arsenm mentioned this in D11830: AMDGPU/SI: Fold AMDGPUISD::CLAMP into VOP3 instructions when possible.

I only know that exceptions won't occur with the clamp modifier. No idea about denormals.

Also, shouldn't this handle MIN as well?

In D30134#681271, @mareko wrote:

I only know that exceptions won't occur with the clamp modifier. No idea about denormals.

Also, shouldn't this handle MIN as well?

There's no practical reason to handle min. The higher level operation minnum(x, x) is folded to x in the IR, so this should only be appearing when we emit this pattern for the clamp operation, where max was arbitrarily chosen.

In D30134#682357, @arsenm wrote:

In D30134#681271, @mareko wrote:

I only know that exceptions won't occur with the clamp modifier. No idea about denormals.

Also, shouldn't this handle MIN as well?

There's no practical reason to handle min. The higher level operation minnum(x, x) is folded to x in the IR, so this should only be appearing when we emit this pattern for the clamp operation, where max was arbitrarily chosen.

I don't understand. FPClamp(x) = min(max(x, 0), 1). I don't see min handled here, that's why I asked.

In D30134#682362, @mareko wrote:

In D30134#682357, @arsenm wrote:

In D30134#681271, @mareko wrote:

I only know that exceptions won't occur with the clamp modifier. No idea about denormals.

Also, shouldn't this handle MIN as well?

There's no practical reason to handle min. The higher level operation minnum(x, x) is folded to x in the IR, so this should only be appearing when we emit this pattern for the clamp operation, where max was arbitrarily chosen.

I don't understand. FPClamp(x) = min(max(x, 0), 1). I don't see min handled here, that's why I asked.

This isn't directly matching the clamp pattern. In the DAG we match that to AMDGPUISD::CLAMP. We emit that as the max(x, x) clamp. We match that here.

Allow with denormals enabled

Herald added a subscriber: dstuttard. · View Herald TranscriptFeb 21 2017, 9:05 AM

arsenm added a child revision: D30212: AMDGPU: Use clamp with f64.Feb 21 2017, 9:37 AM

LGTM.

r295905

This revision is now accepted and ready to land.Feb 22 2017, 3:39 PM

arsenm closed this revision.Feb 22 2017, 3:40 PM

foad added a subscriber: foad.Jun 29 2022, 4:02 AM

foad added inline comments.

test/CodeGen/AMDGPU/clamp.ll
108	Does this FIXME still make sense? Not sure what it was trying to say in the first place.
128	Ditto.

Herald added a project: Restricted Project. · View Herald TranscriptJun 29 2022, 4:02 AM

Herald added subscribers: kosarev, jsilvanus, kerbowa and 2 others. · View Herald Transcript

arsenm added inline comments.Jun 29 2022, 6:16 AM

test/CodeGen/AMDGPU/clamp.ll
108	The source modifier is used here, so I don't think so

foad mentioned this in rG7758f3aa9683: [AMDGPU] Remove FIXMEs that were resolved by D30134.Jun 29 2022, 6:43 AM

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

4 lines

76 lines

5 lines

8 lines

1 line

1 line

test/

CodeGen/

AMDGPU/

clamp-modifier.ll

190 lines

clamp-omod-special-case.mir

135 lines

clamp.ll

15 lines

Diff 89227

lib/Target/AMDGPU/SIDefines.h

Show First 20 Lines • Show All 59 Lines • ▼ Show 20 Lines	// TODO: Should this be spilt into VOP3 a and b?
LGKM_CNT = UINT64_C(1) << 34,		LGKM_CNT = UINT64_C(1) << 34,

WQM = UINT64_C(1) << 35,		WQM = UINT64_C(1) << 35,
DisableWQM = UINT64_C(1) << 36,		DisableWQM = UINT64_C(1) << 36,
Gather4 = UINT64_C(1) << 37,		Gather4 = UINT64_C(1) << 37,
SOPK_ZEXT = UINT64_C(1) << 38,		SOPK_ZEXT = UINT64_C(1) << 38,
SCALAR_STORE = UINT64_C(1) << 39,		SCALAR_STORE = UINT64_C(1) << 39,
FIXED_SIZE = UINT64_C(1) << 40,		FIXED_SIZE = UINT64_C(1) << 40,
VOPAsmPrefer32Bit = UINT64_C(1) << 41		VOPAsmPrefer32Bit = UINT64_C(1) << 41,
		HasFPClamp = UINT64_C(1) << 42
};		};

// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.		// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
// The result is true if any of these tests are true.		// The result is true if any of these tests are true.
enum ClassFlags {		enum ClassFlags {
S_NAN = 1 << 0, // Signaling NaN		S_NAN = 1 << 0, // Signaling NaN
Q_NAN = 1 << 1, // Quiet NaN		Q_NAN = 1 << 1, // Quiet NaN
N_INFINITY = 1 << 2, // Negative infinity		N_INFINITY = 1 << 2, // Negative infinity
▲ Show 20 Lines • Show All 318 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIFoldOperands.cpp

Show First 20 Lines • Show All 60 Lines • ▼ Show 20 Lines
};		};

class SIFoldOperands : public MachineFunctionPass {		class SIFoldOperands : public MachineFunctionPass {
public:		public:
static char ID;		static char ID;
MachineRegisterInfo *MRI;		MachineRegisterInfo *MRI;
const SIInstrInfo *TII;		const SIInstrInfo *TII;
const SIRegisterInfo *TRI;		const SIRegisterInfo *TRI;
		const SISubtarget *ST;

void foldOperand(MachineOperand &OpToFold,		void foldOperand(MachineOperand &OpToFold,
MachineInstr *UseMI,		MachineInstr *UseMI,
unsigned UseOpIdx,		unsigned UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList,		SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;		SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;		void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;

		const MachineOperand *isClamp(const MachineInstr &MI) const;
		bool tryFoldClamp(MachineInstr &MI);

public:		public:
SIFoldOperands() : MachineFunctionPass(ID) {		SIFoldOperands() : MachineFunctionPass(ID) {
initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());		initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
}		}

bool runOnMachineFunction(MachineFunction &MF) override;		bool runOnMachineFunction(MachineFunction &MF) override;

StringRef getPassName() const override { return "SI Fold Operands"; }		StringRef getPassName() const override { return "SI Fold Operands"; }
▲ Show 20 Lines • Show All 595 Lines • ▼ Show 20 Lines	if (updateOperand(Fold, *TRI)) {
MRI->clearKillFlags(Fold.OpToFold->getReg());		MRI->clearKillFlags(Fold.OpToFold->getReg());
}		}
DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<		DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');		static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
}		}
}		}
}		}

		// Recognizes the floating-point clamp idiom: a V_MAX_F32_e64 / V_MAX_F16_e64
		// with its clamp operand set, whose two sources are the same full register,
		// and which has no source modifiers and no omod.
		//
		// Returns the src0 operand when \p MI matches, nullptr otherwise.
		const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
		  unsigned Op = MI.getOpcode();
		  switch (Op) {
		  case AMDGPU::V_MAX_F32_e64:
		  case AMDGPU::V_MAX_F16_e64: {
		    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
		      return nullptr;

		    // Make sure sources are identical: both must be registers, they must
		    // name the same register, and neither may use a sub-register index.
		    // Checking isReg() on both operands first keeps getReg()/getSubReg()
		    // from being called on a non-register operand.
		    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
		    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
		    if (!Src0->isReg() || !Src1->isReg() ||
		        Src0->getReg() != Src1->getReg() ||
		        Src0->getSubReg() != Src1->getSubReg() ||
		        Src0->getSubReg() != AMDGPU::NoSubRegister)
		      return nullptr;

		    // Can't fold up if we have modifiers.
		    if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
		        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
		        TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
		      return nullptr;
		    return Src0;
		  }
		  default:
		    return nullptr;
		  }
		}

		// We obviously have multiple uses in a clamp since the register is used twice
		// in the same instruction.
		static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
		int Count = 0;
		for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
		I != E; ++I) {
		if (++Count > 1)
		return false;
		}

		return true;
		}

		// Tries to fold the clamp performed by \p MI into the instruction that
		// defines its source. \p MI must match isClamp() and its source register
		// must be used by no other non-debug instruction. If the defining
		// instruction has the HasFPClamp flag and a clamp operand, that operand is
		// set to 1, all uses of \p MI's result are redirected to the defining
		// instruction's result, and \p MI is erased.
		//
		// Returns true if the fold was performed (and \p MI no longer exists).
		//
		// FIXME: Does this need to check IEEE bit on function?
		bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
		  const MachineOperand *ClampSrc = isClamp(MI);
		  if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
		    return false;

		  // Bail out rather than dereference null if the source register has no
		  // defining instruction.
		  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
		  if (!Def || !TII->hasFPClamp(*Def))
		    return false;

		  MachineOperand *DefClamp =
		      TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
		  if (!DefClamp)
		    return false;

		  DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n');

		  // Clamp is applied after omod, so it is OK if omod is set.
		  DefClamp->setImm(1);
		  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
		  MI.eraseFromParent();
		  return true;
		}

bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {		bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(*MF.getFunction()))		if (skipFunction(*MF.getFunction()))
return false;		return false;

const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

MRI = &MF.getRegInfo();		MRI = &MF.getRegInfo();
TII = ST.getInstrInfo();		ST = &MF.getSubtarget<SISubtarget>();
		TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();		TRI = &TII->getRegisterInfo();

for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();		for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {		BI != BE; ++BI) {

MachineBasicBlock &MBB = *BI;		MachineBasicBlock &MBB = *BI;
MachineBasicBlock::iterator I, Next;		MachineBasicBlock::iterator I, Next;
for (I = MBB.begin(); I != MBB.end(); I = Next) {		for (I = MBB.begin(); I != MBB.end(); I = Next) {
Next = std::next(I);		Next = std::next(I);
MachineInstr &MI = *I;		MachineInstr &MI = *I;

if (!isSafeToFold(MI))		if (!isSafeToFold(MI)) {
		// TODO: Try omod also.
		tryFoldClamp(MI);
continue;		continue;
		}

MachineOperand &OpToFold = MI.getOperand(1);		MachineOperand &OpToFold = MI.getOperand(1);
bool FoldingImm = OpToFold.isImm() \|\| OpToFold.isFI();		bool FoldingImm = OpToFold.isImm() \|\| OpToFold.isFI();

// FIXME: We could also be folding things like TargetIndexes.		// FIXME: We could also be folding things like TargetIndexes.
if (!FoldingImm && !OpToFold.isReg())		if (!FoldingImm && !OpToFold.isReg())
continue;		continue;

Show All 20 Lines

lib/Target/AMDGPU/SIInstrFormats.td

Show First 20 Lines • Show All 72 Lines • ▼ Show 20 Lines	class InstSI <dag outs, dag ins, string asm = "",
// Whether the operands can be ignored when computing the		// Whether the operands can be ignored when computing the
// instruction size.		// instruction size.
field bit FixedSize = 0;		field bit FixedSize = 0;

// This bit tells the assembler to use the 32-bit encoding in case it		// This bit tells the assembler to use the 32-bit encoding in case it
// is unable to infer the encoding from the operands.		// is unable to infer the encoding from the operands.
field bit VOPAsmPrefer32Bit = 0;		field bit VOPAsmPrefer32Bit = 0;

		// This bit indicates that this has a floating point result type, so
		// the clamp modifier has floating point semantics.
		field bit FPClamp = 0;

// These need to be kept in sync with the enum in SIInstrFlags.		// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;		let TSFlags{0} = SALU;
let TSFlags{1} = VALU;		let TSFlags{1} = VALU;

let TSFlags{2} = SOP1;		let TSFlags{2} = SOP1;
let TSFlags{3} = SOP2;		let TSFlags{3} = SOP2;
let TSFlags{4} = SOPC;		let TSFlags{4} = SOPC;
let TSFlags{5} = SOPK;		let TSFlags{5} = SOPK;
Show All 26 Lines	class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{35} = WQM;		let TSFlags{35} = WQM;
let TSFlags{36} = DisableWQM;		let TSFlags{36} = DisableWQM;
let TSFlags{37} = Gather4;		let TSFlags{37} = Gather4;

let TSFlags{38} = SOPKZext;		let TSFlags{38} = SOPKZext;
let TSFlags{39} = ScalarStore;		let TSFlags{39} = ScalarStore;
let TSFlags{40} = FixedSize;		let TSFlags{40} = FixedSize;
let TSFlags{41} = VOPAsmPrefer32Bit;		let TSFlags{41} = VOPAsmPrefer32Bit;
		let TSFlags{42} = FPClamp;

let SchedRW = [Write32Bit];		let SchedRW = [Write32Bit];

field bits<1> DisableSIDecoder = 0;		field bits<1> DisableSIDecoder = 0;
field bits<1> DisableVIDecoder = 0;		field bits<1> DisableVIDecoder = 0;
field bits<1> DisableDecoder = 0;		field bits<1> DisableDecoder = 0;

let isAsmParserOnly = !if(!eq(DisableDecoder{0}, {0}), 0, 1);		let isAsmParserOnly = !if(!eq(DisableDecoder{0}, {0}), 0, 1);
▲ Show 20 Lines • Show All 155 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstrInfo.h

Show First 20 Lines • Show All 468 Lines • ▼ Show 20 Lines	public:
static bool isFixedSize(const MachineInstr &MI) {		static bool isFixedSize(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::FIXED_SIZE;		return MI.getDesc().TSFlags & SIInstrFlags::FIXED_SIZE;
}		}

bool isFixedSize(uint16_t Opcode) const {		bool isFixedSize(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::FIXED_SIZE;		return get(Opcode).TSFlags & SIInstrFlags::FIXED_SIZE;
}		}

		// True when the descriptor of \p MI has the SIInstrFlags::HasFPClamp bit
		// set in its TSFlags.
		static bool hasFPClamp(const MachineInstr &MI) {
		  const auto &Desc = MI.getDesc();
		  return (Desc.TSFlags & SIInstrFlags::HasFPClamp) != 0;
		}

		// Opcode-based variant: true when the descriptor for \p Opcode has the
		// SIInstrFlags::HasFPClamp bit set in its TSFlags.
		bool hasFPClamp(uint16_t Opcode) const {
		  const auto &Desc = get(Opcode);
		  return (Desc.TSFlags & SIInstrFlags::HasFPClamp) != 0;
		}

bool isVGPRCopy(const MachineInstr &MI) const {		bool isVGPRCopy(const MachineInstr &MI) const {
assert(MI.isCopy());		assert(MI.isCopy());
unsigned Dest = MI.getOperand(0).getReg();		unsigned Dest = MI.getOperand(0).getReg();
const MachineFunction &MF = *MI.getParent()->getParent();		const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();		const MachineRegisterInfo &MRI = MF.getRegInfo();
return !RI.isSGPRReg(MRI, Dest);		return !RI.isSGPRReg(MRI, Dest);
}		}

▲ Show 20 Lines • Show All 332 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstrInfo.td

Show First 20 Lines • Show All 1,080 Lines • ▼ Show 20 Lines	class VOPProfile <list<ValueType> _ArgVT> {

field bit HasSrc0Mods = HasModifiers;		field bit HasSrc0Mods = HasModifiers;
field bit HasSrc1Mods = !if(HasModifiers, BitOr<HasSrc1FloatMods, HasSrc1IntMods>.ret, 0);		field bit HasSrc1Mods = !if(HasModifiers, BitOr<HasSrc1FloatMods, HasSrc1IntMods>.ret, 0);
field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0);		field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0);

field bit HasOMod = HasModifiers;		field bit HasOMod = HasModifiers;
field bit HasClamp = HasModifiers;		field bit HasClamp = HasModifiers;
field bit HasSDWAClamp = HasSrc0;		field bit HasSDWAClamp = HasSrc0;
		field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret;

field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;		field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;

field dag Outs = !if(HasDst,(outs DstRC:$vdst),(outs));		field dag Outs = !if(HasDst,(outs DstRC:$vdst),(outs));

// VOP3b instructions are a special case with a second explicit		// VOP3b instructions are a special case with a second explicit
// output. This is manually overridden for them.		// output. This is manually overridden for them.
field dag Outs32 = Outs;		field dag Outs32 = Outs;
▲ Show 20 Lines • Show All 219 Lines • Show Last 20 Lines

lib/Target/AMDGPU/VOPInstructions.td

Show First 20 Lines • Show All 94 Lines • ▼ Show 20 Lines	class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3Only = 0> :
// Using complex patterns gives VOP3 patterns a very high complexity rating,		// Using complex patterns gives VOP3 patterns a very high complexity rating,
// but standalone patterns are almost always preferred, so we need to adjust the		// but standalone patterns are almost always preferred, so we need to adjust the
// priority lower. The goal is to use a high number to reduce complexity to		// priority lower. The goal is to use a high number to reduce complexity to
// zero (or less than zero).		// zero (or less than zero).
let AddedComplexity = -1000;		let AddedComplexity = -1000;

let VOP3 = 1;		let VOP3 = 1;
let VALU = 1;		let VALU = 1;
		let FPClamp = P.HasFPClamp;
let Uses = [EXEC];		let Uses = [EXEC];

let AsmVariantName = AMDGPUAsmVariants.VOP3;		let AsmVariantName = AMDGPUAsmVariants.VOP3;
let AsmMatchConverter =		let AsmMatchConverter =
!if(!eq(VOP3Only,1),		!if(!eq(VOP3Only,1),
"cvtVOP3",		"cvtVOP3",
!if(!eq(P.HasModifiers, 1), "cvtVOP3_2_mod", ""));		!if(!eq(P.HasModifiers, 1), "cvtVOP3_2_mod", ""));

▲ Show 20 Lines • Show All 240 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/clamp-modifier.ll

This file was added.

				; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=SI %s
				; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=VI %s

				; GCN-LABEL: {{^}}v_clamp_add_src_f32:
				; GCN: {{buffer\|flat}}_load_dword [[A:v[0-9]+]]
				; GCN-NOT: [[A]]
				; GCN: v_add_f32_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}}
				define amdgpu_kernel void @v_clamp_add_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				%tid = call i32 @llvm.amdgcn.workitem.id.x()
				%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				%a = load float, float addrspace(1)* %gep0
				%add = fadd float %a, 1.0
				%max = call float @llvm.maxnum.f32(float %add, float 0.0)
				%clamp = call float @llvm.minnum.f32(float %max, float 1.0)
				store float %clamp, float addrspace(1)* %out.gep
				ret void
				}

				; GCN-LABEL: {{^}}v_clamp_multi_use_src_f32:
				; GCN: {{buffer\|flat}}_load_dword [[A:v[0-9]+]]
				; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
				; GCN: v_max_f32_e64 v{{[0-9]+}}, [[ADD]], [[ADD]] clamp{{$}}
				define amdgpu_kernel void @v_clamp_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				%tid = call i32 @llvm.amdgcn.workitem.id.x()
				%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				%a = load float, float addrspace(1)* %gep0
				%add = fadd float %a, 1.0
				%max = call float @llvm.maxnum.f32(float %add, float 0.0)
				%clamp = call float @llvm.minnum.f32(float %max, float 1.0)
				store float %clamp, float addrspace(1)* %out.gep
				store volatile float %add, float addrspace(1)* undef
				ret void
				}

				; GCN-LABEL: {{^}}v_clamp_dbg_use_src_f32:
				; GCN: {{buffer\|flat}}_load_dword [[A:v[0-9]+]]
				; GCN-NOT: [[A]]
				; GCN: v_add_f32_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}}
				define amdgpu_kernel void @v_clamp_dbg_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				%tid = call i32 @llvm.amdgcn.workitem.id.x()
				%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				%a = load float, float addrspace(1)* %gep0
				%add = fadd float %a, 1.0
				call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
				%max = call float @llvm.maxnum.f32(float %add, float 0.0)
				%clamp = call float @llvm.minnum.f32(float %max, float 1.0)
				store float %clamp, float addrspace(1)* %out.gep
				ret void
				}

				; GCN-LABEL: {{^}}v_clamp_add_neg_src_f32:
				; GCN: {{buffer\|flat}}_load_dword [[A:v[0-9]+]]
				; GCN: v_floor_f32_e32 [[FLOOR:v[0-9]+]], [[A]]
				; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[FLOOR]], -[[FLOOR]] clamp{{$}}
				define amdgpu_kernel void @v_clamp_add_neg_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				%tid = call i32 @llvm.amdgcn.workitem.id.x()
				%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				%a = load float, float addrspace(1)* %gep0
				%floor = call float @llvm.floor.f32(float %a)
				%neg.floor = fsub float -0.0, %floor
				%max = call float @llvm.maxnum.f32(float %neg.floor, float 0.0)
				%clamp = call float @llvm.minnum.f32(float %max, float 1.0)
				store float %clamp, float addrspace(1)* %out.gep
				ret void
				}

				; GCN-LABEL: {{^}}v_non_clamp_max_f32:
				; GCN: {{buffer\|flat}}_load_dword [[A:v[0-9]+]]
				; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
				; GCN: v_max_f32_e32 v{{[0-9]+}}, 0, [[ADD]]{{$}}
				define amdgpu_kernel void @v_non_clamp_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
				%tid = call i32 @llvm.amdgcn.workitem.id.x()
				%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				%a = load float, float addrspace(1)* %gep0
				%add = fadd float %a, 1.0
				%max = call float @llvm.maxnum.f32(float %add, float 0.0)
				store float %max, float addrspace(1)* %out.gep
				ret void
				}

				; GCN-LABEL: {{^}}v_clamp_add_src_f32_denormals:
				; GCN: {{buffer\|flat}}_load_dword [[A:v[0-9]+]]
				; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], [[A]], 1.0 clamp{{$}}
				define amdgpu_kernel void @v_clamp_add_src_f32_denormals(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
				%tid = call i32 @llvm.amdgcn.workitem.id.x()
				%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
				%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				%a = load float, float addrspace(1)* %gep0
				%add = fadd float %a, 1.0
				%max = call float @llvm.maxnum.f32(float %add, float 0.0)
				%clamp = call float @llvm.minnum.f32(float %max, float 1.0)
				store float %clamp, float addrspace(1)* %out.gep
				ret void
				}

				; GCN-LABEL: {{^}}v_clamp_add_src_f16_denorm:
				; GCN: {{buffer\|flat}}_load_ushort [[A:v[0-9]+]]
				; VI: v_add_f16_e64 [[ADD:v[0-9]+]], [[A]], 1.0 clamp{{$}}

				; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
				; SI: v_add_f32_e64 [[ADD:v[0-9]+]], [[CVT]], 1.0 clamp{{$}}
				; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[ADD]]
				define amdgpu_kernel void @v_clamp_add_src_f16_denorm(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
				%tid = call i32 @llvm.amdgcn.workitem.id.x()
				%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
				%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
				%a = load half, half addrspace(1)* %gep0
				%add = fadd half %a, 1.0
				%max = call half @llvm.maxnum.f16(half %add, half 0.0)
				%clamp = call half @llvm.minnum.f16(half %max, half 1.0)
				store half %clamp, half addrspace(1)* %out.gep
				ret void
				}

				; GCN-LABEL: {{^}}v_clamp_add_src_f16_no_denormals:
				; GCN: {{buffer\|flat}}_load_ushort [[A:v[0-9]+]]
				; VI-NOT: [[A]]
				; VI: v_add_f16_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}}

				; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
				; SI: v_add_f32_e64 [[ADD:v[0-9]+]], [[CVT]], 1.0 clamp{{$}}
				; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[ADD]]
				define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(half addrspace(1)* %out, half addrspace(1)* %aptr) #3 {
				%tid = call i32 @llvm.amdgcn.workitem.id.x()
				%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
				%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
				%a = load half, half addrspace(1)* %gep0
				%add = fadd half %a, 1.0
				%max = call half @llvm.maxnum.f16(half %add, half 0.0)
				%clamp = call half @llvm.minnum.f16(half %max, half 1.0)
				store half %clamp, half addrspace(1)* %out.gep
				ret void
				}

				; GCN-LABEL: {{^}}v_clamp_add_src_v2f32:
				; GCN: {{buffer\|flat}}_load_dwordx2 v{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}
				; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, v[[A]], 1.0 clamp{{$}}
				; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, v[[B]], 1.0 clamp{{$}}
				define amdgpu_kernel void @v_clamp_add_src_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %aptr) #0 {
				%tid = call i32 @llvm.amdgcn.workitem.id.x()
				%gep0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %aptr, i32 %tid
				%out.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %out, i32 %tid
				%a = load <2 x float>, <2 x float> addrspace(1)* %gep0
				%add = fadd <2 x float> %a, <float 1.0, float 1.0>
				%max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %add, <2 x float> zeroinitializer)
				%clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>)
				store <2 x float> %clamp, <2 x float> addrspace(1)* %out.gep
				ret void
				}

				declare i32 @llvm.amdgcn.workitem.id.x() #1
				declare float @llvm.fabs.f32(float) #1
				declare float @llvm.floor.f32(float) #1
				declare float @llvm.minnum.f32(float, float) #1
				declare float @llvm.maxnum.f32(float, float) #1
				declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
				declare double @llvm.fabs.f64(double) #1
				declare double @llvm.minnum.f64(double, double) #1
				declare double @llvm.maxnum.f64(double, double) #1
				declare half @llvm.fabs.f16(half) #1
				declare half @llvm.minnum.f16(half, half) #1
				declare half @llvm.maxnum.f16(half, half) #1
				declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1
				declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1
				declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1

				attributes #0 = { nounwind }
				attributes #1 = { nounwind readnone }
				attributes #2 = { nounwind "target-features"="+fp32-denormals" }
				attributes #3 = { nounwind "target-features"="-fp64-fp16-denormals" }

				!llvm.dbg.cu = !{!0}
				!llvm.module.flags = !{!2, !3}

				!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
				!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
				!2 = !{i32 2, !"Dwarf Version", i32 4}
				!3 = !{i32 2, !"Debug Info Version", i32 3}
				!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)
				!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
				!6 = !DISubroutineType(types: !7)
				!7 = !{null, !8}
				!8 = !DIBasicType(name: "float", size: 32, align: 32)
				!9 = !DIExpression()
				!10 = !DILocation(line: 1, column: 42, scope: !5)

test/CodeGen/AMDGPU/clamp-omod-special-case.mir

This file was added.

				# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fold-operands %s -o - \| FileCheck -check-prefix=GCN %s
				--- \|
				define amdgpu_kernel void @v_max_self_clamp_not_set_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) {
				ret void
				}

				define amdgpu_kernel void @v_clamp_omod_already_set_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) {
				ret void
				}

				...
				---
				# GCN-LABEL: name: v_max_self_clamp_not_set_f32
				# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
				# GCN-NEXT: %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit %exec

				name: v_max_self_clamp_not_set_f32
				tracksRegLiveness: true
				registers:
				- { id: 0, class: sgpr_64 }
				- { id: 1, class: sreg_32_xm0 }
				- { id: 2, class: sgpr_32 }
				- { id: 3, class: vgpr_32 }
				- { id: 4, class: sreg_64_xexec }
				- { id: 5, class: sreg_64_xexec }
				- { id: 6, class: sreg_32 }
				- { id: 7, class: sreg_32 }
				- { id: 8, class: sreg_32_xm0 }
				- { id: 9, class: sreg_64 }
				- { id: 10, class: sreg_32_xm0 }
				- { id: 11, class: sreg_32_xm0 }
				- { id: 12, class: sgpr_64 }
				- { id: 13, class: sgpr_128 }
				- { id: 14, class: sreg_32_xm0 }
				- { id: 15, class: sreg_64 }
				- { id: 16, class: sgpr_128 }
				- { id: 17, class: vgpr_32 }
				- { id: 18, class: vreg_64 }
				- { id: 19, class: vgpr_32 }
				- { id: 20, class: vgpr_32 }
				- { id: 21, class: vgpr_32 }
				- { id: 22, class: vgpr_32 }
				- { id: 23, class: vreg_64 }
				- { id: 24, class: vgpr_32 }
				- { id: 25, class: vreg_64 }
				- { id: 26, class: vreg_64 }
				liveins:
				- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
				- { reg: '%vgpr0', virtual-reg: '%3' }
				body: \|
				bb.0 (%ir-block.0):
				liveins: %sgpr0_sgpr1, %vgpr0

				%3 = COPY %vgpr0
				%0 = COPY %sgpr0_sgpr1
				%4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
				%5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
				%24 = V_ASHRREV_I32_e32 31, %3, implicit %exec
				%25 = REG_SEQUENCE %3, 1, %24, 2
				%10 = S_MOV_B32 61440
				%11 = S_MOV_B32 0
				%12 = REG_SEQUENCE killed %11, 1, killed %10, 2
				%13 = REG_SEQUENCE killed %5, 17, %12, 18
				%14 = S_MOV_B32 2
				%26 = V_LSHL_B64 killed %25, 2, implicit %exec
				%16 = REG_SEQUENCE killed %4, 17, %12, 18
				%18 = COPY %26
				%17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec
				%20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
				%21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit %exec
				BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec
				S_ENDPGM

				...
				---
				# GCN-LABEL: name: v_clamp_omod_already_set_f32
				# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
				# GCN: %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit %exec
				name: v_clamp_omod_already_set_f32
				tracksRegLiveness: true
				registers:
				- { id: 0, class: sgpr_64 }
				- { id: 1, class: sreg_32_xm0 }
				- { id: 2, class: sgpr_32 }
				- { id: 3, class: vgpr_32 }
				- { id: 4, class: sreg_64_xexec }
				- { id: 5, class: sreg_64_xexec }
				- { id: 6, class: sreg_32 }
				- { id: 7, class: sreg_32 }
				- { id: 8, class: sreg_32_xm0 }
				- { id: 9, class: sreg_64 }
				- { id: 10, class: sreg_32_xm0 }
				- { id: 11, class: sreg_32_xm0 }
				- { id: 12, class: sgpr_64 }
				- { id: 13, class: sgpr_128 }
				- { id: 14, class: sreg_32_xm0 }
				- { id: 15, class: sreg_64 }
				- { id: 16, class: sgpr_128 }
				- { id: 17, class: vgpr_32 }
				- { id: 18, class: vreg_64 }
				- { id: 19, class: vgpr_32 }
				- { id: 20, class: vgpr_32 }
				- { id: 21, class: vgpr_32 }
				- { id: 22, class: vgpr_32 }
				- { id: 23, class: vreg_64 }
				- { id: 24, class: vgpr_32 }
				- { id: 25, class: vreg_64 }
				- { id: 26, class: vreg_64 }
				liveins:
				- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
				- { reg: '%vgpr0', virtual-reg: '%3' }
				body: \|
				bb.0 (%ir-block.0):
				liveins: %sgpr0_sgpr1, %vgpr0

				%3 = COPY %vgpr0
				%0 = COPY %sgpr0_sgpr1
				%4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
				%5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
				%24 = V_ASHRREV_I32_e32 31, %3, implicit %exec
				%25 = REG_SEQUENCE %3, 1, %24, 2
				%10 = S_MOV_B32 61440
				%11 = S_MOV_B32 0
				%12 = REG_SEQUENCE killed %11, 1, killed %10, 2
				%13 = REG_SEQUENCE killed %5, 17, %12, 18
				%14 = S_MOV_B32 2
				%26 = V_LSHL_B64 killed %25, 2, implicit %exec
				%16 = REG_SEQUENCE killed %4, 17, %12, 18
				%18 = COPY %26
				%17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec
				%20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
				%21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit %exec
				BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec
				S_ENDPGM
				...

test/CodeGen/AMDGPU/clamp.ll

Show First 20 Lines • Show All 81 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
store volatile float %max, float addrspace(1)* undef		store volatile float %max, float addrspace(1)* undef
ret void		ret void
}		}

; GCN-LABEL: {{^}}v_clamp_f16:		; GCN-LABEL: {{^}}v_clamp_f16:
; GCN: {{buffer\|flat}}_load_ushort [[A:v[0-9]+]]		; GCN: {{buffer\|flat}}_load_ushort [[A:v[0-9]+]]
; VI: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}		; VI: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}

; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]		; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], [[A]] clamp{{$}}
; SI: v_max_f32_e64 v{{[0-9]+}}, [[CVT]], [[CVT]] clamp{{$}}		; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
; SI: v_cvt_f16_f32_e32
define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {		define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()		%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid		%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid		%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
%a = load half, half addrspace(1)* %gep0		%a = load half, half addrspace(1)* %gep0
%max = call half @llvm.maxnum.f16(half %a, half 0.0)		%max = call half @llvm.maxnum.f16(half %a, half 0.0)
%med = call half @llvm.minnum.f16(half %max, half 1.0)		%med = call half @llvm.minnum.f16(half %max, half 1.0)

store half %med, half addrspace(1)* %out.gep		store half %med, half addrspace(1)* %out.gep
ret void		ret void
}		}

; GCN-LABEL: {{^}}v_clamp_neg_f16:		; GCN-LABEL: {{^}}v_clamp_neg_f16:
; GCN: {{buffer\|flat}}_load_ushort [[A:v[0-9]+]]		; GCN: {{buffer\|flat}}_load_ushort [[A:v[0-9]+]]
; VI: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}		; VI: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}

; FIXME: Better to fold neg into max		; FIXME: Better to fold neg into max
		foad (inline comment, not done): Does this FIXME still make sense? Not sure what it was trying to say in the first place.
		arsenm (author, inline reply, done): The source modifier is used here, so I don't think so.
; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]		; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] clamp{{$}}
; SI: v_max_f32_e64 v{{[0-9]+}}, [[CVT]], [[CVT]] clamp{{$}}		; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
; SI: v_cvt_f16_f32
define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {		define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()		%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid		%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid		%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
%a = load half, half addrspace(1)* %gep0		%a = load half, half addrspace(1)* %gep0
%fneg.a = fsub half -0.0, %a		%fneg.a = fsub half -0.0, %a
%max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0)		%max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0)
%med = call half @llvm.minnum.f16(half %max, half 1.0)		%med = call half @llvm.minnum.f16(half %max, half 1.0)

store half %med, half addrspace(1)* %out.gep		store half %med, half addrspace(1)* %out.gep
ret void		ret void
}		}

; GCN-LABEL: {{^}}v_clamp_negabs_f16:		; GCN-LABEL: {{^}}v_clamp_negabs_f16:
; GCN: {{buffer\|flat}}_load_ushort [[A:v[0-9]+]]		; GCN: {{buffer\|flat}}_load_ushort [[A:v[0-9]+]]
; VI: v_max_f16_e64 v{{[0-9]+}}, -\|[[A]]\|, -\|[[A]]\| clamp{{$}}		; VI: v_max_f16_e64 v{{[0-9]+}}, -\|[[A]]\|, -\|[[A]]\| clamp{{$}}

; FIXME: Better to fold neg/abs into max		; FIXME: Better to fold neg/abs into max
		foad (inline comment, not done): Ditto — does this FIXME still make sense?

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -\|[[A]]\|		; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -\|[[A]]\| clamp{{$}}
; SI: v_max_f32_e64 v{{[0-9]+}}, [[CVT]], [[CVT]] clamp{{$}}		; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
; SI: v_cvt_f16_f32_e32
define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {		define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()		%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid		%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid		%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
%a = load half, half addrspace(1)* %gep0		%a = load half, half addrspace(1)* %gep0
%fabs.a = call half @llvm.fabs.f16(half %a)		%fabs.a = call half @llvm.fabs.f16(half %a)
%fneg.fabs.a = fsub half -0.0, %fabs.a		%fneg.fabs.a = fsub half -0.0, %fabs.a

▲ Show 20 Lines • Show All 393 Lines • Show Last 20 Lines