Diff 334679

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Show First 20 Lines • Show All 1,400 Lines • ▼ Show 20 Lines	bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
DefClamp->setImm(1);		DefClamp->setImm(1);
MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());		MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
MI.eraseFromParent();		MI.eraseFromParent();
return true;		return true;
}		}

static int getOModValue(unsigned Opc, int64_t Val) {		static int getOModValue(unsigned Opc, int64_t Val) {
switch (Opc) {		switch (Opc) {
		case AMDGPU::V_MUL_F64_e64: {
		switch (Val) {
		case 0x3fe0000000000000: // 0.5
		return SIOutMods::DIV2;
		case 0x4000000000000000: // 2.0
		return SIOutMods::MUL2;
		case 0x4010000000000000: // 4.0
		return SIOutMods::MUL4;
		default:
		return SIOutMods::NONE;
		}
		}
case AMDGPU::V_MUL_F32_e64: {		case AMDGPU::V_MUL_F32_e64: {
switch (static_cast<uint32_t>(Val)) {		switch (static_cast<uint32_t>(Val)) {
case 0x3f000000: // 0.5		case 0x3f000000: // 0.5
return SIOutMods::DIV2;		return SIOutMods::DIV2;
case 0x40000000: // 2.0		case 0x40000000: // 2.0
return SIOutMods::MUL2;		return SIOutMods::MUL2;
case 0x40800000: // 4.0		case 0x40800000: // 4.0
return SIOutMods::MUL4;		return SIOutMods::MUL4;
Show All 20 Lines

// FIXME: Does this really not support denormals with f16?		// FIXME: Does this really not support denormals with f16?
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not		// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
// handled, so will anything other than that break?		// handled, so will anything other than that break?
std::pair<const MachineOperand *, int>		std::pair<const MachineOperand *, int>
SIFoldOperands::isOMod(const MachineInstr &MI) const {		SIFoldOperands::isOMod(const MachineInstr &MI) const {
unsigned Op = MI.getOpcode();		unsigned Op = MI.getOpcode();
switch (Op) {		switch (Op) {
		case AMDGPU::V_MUL_F64_e64:
case AMDGPU::V_MUL_F32_e64:		case AMDGPU::V_MUL_F32_e64:
case AMDGPU::V_MUL_F16_e64: {		case AMDGPU::V_MUL_F16_e64: {
// If output denormals are enabled, omod is ignored.		// If output denormals are enabled, omod is ignored.
if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32OutputDenormals) \|\|		if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32OutputDenormals) \|\|
(Op == AMDGPU::V_MUL_F16_e64 && MFI->getMode().FP64FP16OutputDenormals))		((Op == AMDGPU::V_MUL_F64_e64 \|\| Op == AMDGPU::V_MUL_F16_e64) &&
		MFI->getMode().FP64FP16OutputDenormals))
return std::make_pair(nullptr, SIOutMods::NONE);		return std::make_pair(nullptr, SIOutMods::NONE);

const MachineOperand *RegOp = nullptr;		const MachineOperand *RegOp = nullptr;
const MachineOperand *ImmOp = nullptr;		const MachineOperand *ImmOp = nullptr;
const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);		const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);		const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
if (Src0->isImm()) {		if (Src0->isImm()) {
ImmOp = Src0;		ImmOp = Src0;
Show All 9 Lines	if (OMod == SIOutMods::NONE \|\|
TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) \|\|		TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) \|\|
TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) \|\|		TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) \|\|
TII->hasModifiersSet(MI, AMDGPU::OpName::omod) \|\|		TII->hasModifiersSet(MI, AMDGPU::OpName::omod) \|\|
TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))		TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
return std::make_pair(nullptr, SIOutMods::NONE);		return std::make_pair(nullptr, SIOutMods::NONE);

return std::make_pair(RegOp, OMod);		return std::make_pair(RegOp, OMod);
}		}
		case AMDGPU::V_ADD_F64_e64:
case AMDGPU::V_ADD_F32_e64:		case AMDGPU::V_ADD_F32_e64:
case AMDGPU::V_ADD_F16_e64: {		case AMDGPU::V_ADD_F16_e64: {
// If output denormals are enabled, omod is ignored.		// If output denormals are enabled, omod is ignored.
if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32OutputDenormals) \|\|		if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32OutputDenormals) \|\|
(Op == AMDGPU::V_ADD_F16_e64 && MFI->getMode().FP64FP16OutputDenormals))		((Op == AMDGPU::V_ADD_F64_e64 \|\| Op == AMDGPU::V_ADD_F16_e64) &&
		MFI->getMode().FP64FP16OutputDenormals))
return std::make_pair(nullptr, SIOutMods::NONE);		return std::make_pair(nullptr, SIOutMods::NONE);

// Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x		// Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);		const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);		const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&		if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
Src0->getSubReg() == Src1->getSubReg() &&		Src0->getSubReg() == Src1->getSubReg() &&
▲ Show 20 Lines • Show All 265 Lines • ▼ Show 20 Lines	for (I = MBB->begin(); I != MBB->end(); I = Next) {
if (!TII->isFoldableCopy(MI)) {		if (!TII->isFoldableCopy(MI)) {
// Saw an unknown clobber of m0, so we no longer know what it is.		// Saw an unknown clobber of m0, so we no longer know what it is.
if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))		if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
CurrentKnownM0Val = nullptr;		CurrentKnownM0Val = nullptr;

// TODO: Omod might be OK if there is NSZ only on the source		// TODO: Omod might be OK if there is NSZ only on the source
// instruction, and not the omod multiply.		// instruction, and not the omod multiply.
if (IsIEEEMode \|\| (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) \|\|		if (IsIEEEMode \|\| (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) \|\|
!tryFoldOMod(MI))		!tryFoldOMod(MI))
		arsenm (inline comment; Unsubmitted, Not Done): This is a separate change.
		arsenm (inline comment; Unsubmitted, Not Done): Also would need a comment explaining that IEEE mode only changes sNaN behavior; nnan lets us ignore it.
tryFoldClamp(MI);		tryFoldClamp(MI);

continue;		continue;
}		}

// Specially track simple redefs of m0 to the same value in a block, so we		// Specially track simple redefs of m0 to the same value in a block, so we
// can erase the later ones.		// can erase the later ones.
if (MI.getOperand(0).getReg() == AMDGPU::M0) {		if (MI.getOperand(0).getReg() == AMDGPU::M0) {
Show All 38 Lines

llvm/test/CodeGen/AMDGPU/omod.ll

	; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s			; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s
	; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s \| FileCheck --check-prefixes=GCN,VI %s			; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s \| FileCheck --check-prefixes=GCN,VI %s

	; IEEE bit enabled for compute kernel, no shouldn't use.			; IEEE bit enabled for compute kernel, so shouldn't use.
	; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_signed_zeros:			; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_signed_zeros:
	; GCN: {{buffer\|flat}}_load_dword [[A:v[0-9]+]]			; GCN: {{buffer\|flat}}_load_dword [[A:v[0-9]+]]
	; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}			; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
	; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}			; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
	define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {			define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
	%tid = call i32 @llvm.amdgcn.workitem.id.x()			%tid = call i32 @llvm.amdgcn.workitem.id.x()
	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid			%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid			%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
	%a = load float, float addrspace(1)* %gep0			%a = load float, float addrspace(1)* %gep0
	%add = fadd float %a, 1.0			%add = fadd float %a, 1.0
	%div2 = fmul float %add, 0.5			%div2 = fmul float %add, 0.5
	store float %div2, float addrspace(1)* %out.gep			store float %div2, float addrspace(1)* %out.gep
	ret void			ret void
	}			}

	; IEEE bit enabled for compute kernel, no shouldn't use even though nsz is allowed			; IEEE bit enabled for compute kernel, so shouldn't use.
				; GCN-LABEL: {{^}}v_omod_div2_f64_enable_ieee_signed_zeros:
				; GCN: {{buffer\|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
				; GCN: v_add_f64 [[ADD:v\[[0-9]+:[0-9]+\]]], [[A]], 1.0{{$}}
				; GCN: v_mul_f64 v{{\[[0-9]+:[0-9]+\]}}, [[ADD]], 0.5{{$}}
				define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(double addrspace(1)* %out, double addrspace(1)* %aptr) #4 {
				%tid = call i32 @llvm.amdgcn.workitem.id.x()
				%gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
				%out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
				%a = load double, double addrspace(1)* %gep0
				%add = fadd double %a, 1.0
				%div2 = fmul double %add, 0.5
				store double %div2, double addrspace(1)* %out.gep
				ret void
				}

				; IEEE bit enabled for compute kernel, so shouldn't use even though nsz is allowed
	; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_nsz:			; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_nsz:
	; GCN: {{buffer\|flat}}_load_dword [[A:v[0-9]+]]			; GCN: {{buffer\|flat}}_load_dword [[A:v[0-9]+]]
	; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}			; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
	; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}			; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
	define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {			define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
	%tid = call i32 @llvm.amdgcn.workitem.id.x()			%tid = call i32 @llvm.amdgcn.workitem.id.x()
	%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid			%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
	%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid			%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
	%a = load float, float addrspace(1)* %gep0			%a = load float, float addrspace(1)* %gep0
	%add = fadd float %a, 1.0			%add = fadd float %a, 1.0
	%div2 = fmul float %add, 0.5			%div2 = fmul float %add, 0.5
	store float %div2, float addrspace(1)* %out.gep			store float %div2, float addrspace(1)* %out.gep
	ret void			ret void
	}			}

				; IEEE bit enabled for compute kernel, so shouldn't use even though nsz is allowed.
				; GCN-LABEL: {{^}}v_omod_div2_f64_enable_ieee_nsz:
				; GCN: {{buffer\|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
				; GCN: v_add_f64 [[ADD:v\[[0-9]+:[0-9]+\]]], [[A]], 1.0{{$}}
				; GCN: v_mul_f64 v{{\[[0-9]+:[0-9]+\]}}, [[ADD]], 0.5{{$}}
				define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(double addrspace(1)* %out, double addrspace(1)* %aptr) #5 {
				%tid = call i32 @llvm.amdgcn.workitem.id.x()
				%gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
				%out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
				%a = load double, double addrspace(1)* %gep0
				%add = fadd double %a, 1.0
				%div2 = fmul double %add, 0.5
				store double %div2, double addrspace(1)* %out.gep
				ret void
				}

	; Only allow without IEEE bit if signed zeros are significant.			; Only allow without IEEE bit if signed zeros are significant.
	; GCN-LABEL: {{^}}v_omod_div2_f32_signed_zeros:			; GCN-LABEL: {{^}}v_omod_div2_f32_signed_zeros:
	; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}			; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
	; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}			; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
	define amdgpu_ps void @v_omod_div2_f32_signed_zeros(float %a) #4 {			define amdgpu_ps void @v_omod_div2_f32_signed_zeros(float %a) #4 {
	%add = fadd float %a, 1.0			%add = fadd float %a, 1.0
	%div2 = fmul float %add, 0.5			%div2 = fmul float %add, 0.5
	store float %div2, float addrspace(1)* undef			store float %div2, float addrspace(1)* undef
	ret void			ret void
	}			}

				; Only allow without IEEE bit if signed zeros are significant.
				; GCN-LABEL: {{^}}v_omod_div2_f64_signed_zeros:
				; GCN: v_add_f64 [[ADD:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, 1.0{{$}}
				; GCN: v_mul_f64 v{{\[[0-9]+:[0-9]+\]}}, [[ADD]], 0.5{{$}}
				define amdgpu_ps void @v_omod_div2_f64_signed_zeros(double %a) #4 {
				%add = fadd double %a, 1.0
				%div2 = fmul double %add, 0.5
				store double %div2, double addrspace(1)* undef
				ret void
				}

	; GCN-LABEL: {{^}}v_omod_div2_f32:			; GCN-LABEL: {{^}}v_omod_div2_f32:
	; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 div:2{{$}}			; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 div:2{{$}}
	define amdgpu_ps void @v_omod_div2_f32(float %a) #0 {			define amdgpu_ps void @v_omod_div2_f32(float %a) #0 {
	%add = fadd float %a, 1.0			%add = fadd float %a, 1.0
	%div2 = fmul float %add, 0.5			%div2 = fmul float %add, 0.5
	store float %div2, float addrspace(1)* undef			store float %div2, float addrspace(1)* undef
	ret void			ret void
	}			}

				; GCN-LABEL: {{^}}v_omod_div2_f64:
				; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 1.0 div:2{{$}}
				define amdgpu_ps void @v_omod_div2_f64(double %a) #5 {
				%add = fadd nsz double %a, 1.0
				%div2 = fmul nsz double %add, 0.5
				arsenm (inline comment; Unsubmitted, Not Done): Should just use the minimum set of fast flags.
				store double %div2, double addrspace(1)* undef
				ret void
				}

	; GCN-LABEL: {{^}}v_omod_mul2_f32:			; GCN-LABEL: {{^}}v_omod_mul2_f32:
	; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:2{{$}}			; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:2{{$}}
	define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 {			define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 {
	%add = fadd float %a, 1.0			%add = fadd float %a, 1.0
	%div2 = fmul float %add, 2.0			%div2 = fmul float %add, 2.0
	store float %div2, float addrspace(1)* undef			store float %div2, float addrspace(1)* undef
	ret void			ret void
	}			}

				; GCN-LABEL: {{^}}v_omod_mul2_f64:
				; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 1.0 mul:2{{$}}
				define amdgpu_ps void @v_omod_mul2_f64(double %a) #5 {
				%add = fadd nsz double %a, 1.0
				%div2 = fmul nsz double %add, 2.0
				arsenm (inline comment; Unsubmitted, Not Done): Should just use the minimum set of fast flags.
				store double %div2, double addrspace(1)* undef
				ret void
				}

	; GCN-LABEL: {{^}}v_omod_mul4_f32:			; GCN-LABEL: {{^}}v_omod_mul4_f32:
	; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:4{{$}}			; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:4{{$}}
	define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 {			define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 {
	%add = fadd float %a, 1.0			%add = fadd float %a, 1.0
	%div2 = fmul float %add, 4.0			%div2 = fmul float %add, 4.0
	store float %div2, float addrspace(1)* undef			store float %div2, float addrspace(1)* undef
	ret void			ret void
	}			}

				; GCN-LABEL: {{^}}v_omod_mul4_f64:
				; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 1.0 mul:4{{$}}
				define amdgpu_ps void @v_omod_mul4_f64(double %a) #5 {
				%add = fadd nsz double %a, 1.0
				%div2 = fmul nsz double %add, 4.0
				arsenm (inline comment; Unsubmitted, Not Done): Should just use the minimum set of fast flags.
				store double %div2, double addrspace(1)* undef
				ret void
				}

	; GCN-LABEL: {{^}}v_omod_mul4_multi_use_f32:			; GCN-LABEL: {{^}}v_omod_mul4_multi_use_f32:
	; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}			; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
	; GCN: v_mul_f32_e32 v{{[0-9]+}}, 4.0, [[ADD]]{{$}}			; GCN: v_mul_f32_e32 v{{[0-9]+}}, 4.0, [[ADD]]{{$}}
	define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {			define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {
	%add = fadd float %a, 1.0			%add = fadd float %a, 1.0
	%div2 = fmul float %add, 4.0			%div2 = fmul float %add, 4.0
	store float %div2, float addrspace(1)* undef			store float %div2, float addrspace(1)* undef
	store volatile float %add, float addrspace(1)* undef			store volatile float %add, float addrspace(1)* undef
	▲ Show 20 Lines • Show All 120 Lines • ▼ Show 20 Lines
	; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}			; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
	define amdgpu_ps void @v_omod_div2_f32_denormals(float %a) #2 {			define amdgpu_ps void @v_omod_div2_f32_denormals(float %a) #2 {
	%add = fadd float %a, 1.0			%add = fadd float %a, 1.0
	%div2 = fmul float %add, 0.5			%div2 = fmul float %add, 0.5
	store float %div2, float addrspace(1)* undef			store float %div2, float addrspace(1)* undef
	ret void			ret void
	}			}

				; Don't fold omod if denorms enabled.
				; GCN-LABEL: {{^}}v_omod_div2_f64_denormals:
				; GCN: v_add_f64 [[ADD:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, 1.0{{$}}
				; GCN: v_mul_f64 v{{\[[0-9]+:[0-9]+\]}}, [[ADD]], 0.5{{$}}
				define amdgpu_ps void @v_omod_div2_f64_denormals(double %a) #6 {
				%add = fadd double %a, 1.0
				%div2 = fmul double %add, 0.5
				store double %div2, double addrspace(1)* undef
				ret void
				}

	; Don't fold omod if denorms enabled for add form.			; Don't fold omod if denorms enabled for add form.
	; GCN-LABEL: {{^}}v_omod_mul2_f32_denormals:			; GCN-LABEL: {{^}}v_omod_mul2_f32_denormals:
	; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}			; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
	; GCN: v_add_f32_e32 v{{[0-9]+}}, [[ADD]], [[ADD]]{{$}}			; GCN: v_add_f32_e32 v{{[0-9]+}}, [[ADD]], [[ADD]]{{$}}
	define amdgpu_ps void @v_omod_mul2_f32_denormals(float %a) #2 {			define amdgpu_ps void @v_omod_mul2_f32_denormals(float %a) #2 {
	%add = fadd float %a, 1.0			%add = fadd float %a, 1.0
	%mul2 = fadd float %add, %add			%mul2 = fadd float %add, %add
	store float %mul2, float addrspace(1)* undef			store float %mul2, float addrspace(1)* undef
	ret void			ret void
	}			}

				; Don't fold omod if denorms enabled for add form.
				; GCN-LABEL: {{^}}v_omod_mul2_f64_denormals:
				; GCN: v_add_f64 [[ADD:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, 1.0{{$}}
				; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, [[ADD]], [[ADD]]{{$}}
				define amdgpu_ps void @v_omod_mul2_f64_denormals(double %a) #2 {
				%add = fadd double %a, 1.0
				%mul2 = fadd double %add, %add
				store double %mul2, double addrspace(1)* undef
				ret void
				}

	; Don't fold omod if denorms enabled			; Don't fold omod if denorms enabled
	; GCN-LABEL: {{^}}v_omod_div2_f16_denormals:			; GCN-LABEL: {{^}}v_omod_div2_f16_denormals:
	; VI: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}			; VI: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
	; VI: v_mul_f16_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}			; VI: v_mul_f16_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
	define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 {			define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 {
	%add = fadd half %a, 1.0			%add = fadd half %a, 1.0
	%div2 = fmul half %add, 0.5			%div2 = fmul half %add, 0.5
	store half %div2, half addrspace(1)* undef			store half %div2, half addrspace(1)* undef
	▲ Show 20 Lines • Show All 46 Lines • ▼ Show 20 Lines
	declare half @llvm.maxnum.f16(half, half) #1			declare half @llvm.maxnum.f16(half, half) #1
	declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1			declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1

	attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" }			attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" }
	attributes #1 = { nounwind readnone }			attributes #1 = { nounwind readnone }
	attributes #2 = { nounwind "denormal-fp-math-f32"="ieee,ieee" "no-signed-zeros-fp-math"="true" }			attributes #2 = { nounwind "denormal-fp-math-f32"="ieee,ieee" "no-signed-zeros-fp-math"="true" }
	attributes #3 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" }			attributes #3 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" }
	attributes #4 = { nounwind "no-signed-zeros-fp-math"="false" }			attributes #4 = { nounwind "no-signed-zeros-fp-math"="false" }
				attributes #5 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
				attributes #6 = { nounwind "denormal-fp-math"="ieee,ieee" "no-signed-zeros-fp-math"="true" }
				arsenm (inline comment; Unsubmitted, Not Done): You're using the fast math flags, so you don't need the global no-signed-zeros-fp-math.

	!llvm.dbg.cu = !{!0}			!llvm.dbg.cu = !{!0}
	!llvm.module.flags = !{!2, !3}			!llvm.module.flags = !{!2, !3}

	!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)			!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
	!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")			!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
	!2 = !{i32 2, !"Dwarf Version", i32 4}			!2 = !{i32 2, !"Dwarf Version", i32 4}
	!3 = !{i32 2, !"Debug Info Version", i32 3}			!3 = !{i32 2, !"Debug Info Version", i32 3}
	!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)			!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)
	!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)			!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
	!6 = !DISubroutineType(types: !7)			!6 = !DISubroutineType(types: !7)
	!7 = !{null, !8}			!7 = !{null, !8}
	!8 = !DIBasicType(name: "float", size: 32, align: 32)			!8 = !DIBasicType(name: "float", size: 32, align: 32)
	!9 = !DIExpression()			!9 = !DIExpression()
	!10 = !DILocation(line: 1, column: 42, scope: !5)			!10 = !DILocation(line: 1, column: 42, scope: !5)

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Enable output modifiers for double precision instructions
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 334679

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

llvm/test/CodeGen/AMDGPU/omod.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Enable output modifiers for double precision instructions
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 334679

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

llvm/test/CodeGen/AMDGPU/omod.ll

[AMDGPU] Enable output modifiers for double precision instructions
ClosedPublic