Index: llvm/lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1406,6 +1406,18 @@ static int getOModValue(unsigned Opc, int64_t Val) { switch (Opc) { + case AMDGPU::V_MUL_F64_e64: { + switch (Val) { + case 0x3fe0000000000000: // 0.5 + return SIOutMods::DIV2; + case 0x4000000000000000: // 2.0 + return SIOutMods::MUL2; + case 0x4010000000000000: // 4.0 + return SIOutMods::MUL4; + default: + return SIOutMods::NONE; + } + } case AMDGPU::V_MUL_F32_e64: { switch (static_cast<uint32_t>(Val)) { case 0x3f000000: // 0.5 @@ -1442,11 +1454,13 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const { unsigned Op = MI.getOpcode(); switch (Op) { + case AMDGPU::V_MUL_F64_e64: case AMDGPU::V_MUL_F32_e64: case AMDGPU::V_MUL_F16_e64: { // If output denormals are enabled, omod is ignored. if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32OutputDenormals) || - (Op == AMDGPU::V_MUL_F16_e64 && MFI->getMode().FP64FP16OutputDenormals)) + ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64) && + MFI->getMode().FP64FP16OutputDenormals)) return std::make_pair(nullptr, SIOutMods::NONE); const MachineOperand *RegOp = nullptr; @@ -1472,11 +1486,13 @@ return std::make_pair(RegOp, OMod); } + case AMDGPU::V_ADD_F64_e64: case AMDGPU::V_ADD_F32_e64: case AMDGPU::V_ADD_F16_e64: { // If output denormals are enabled, omod is ignored. if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32OutputDenormals) || - (Op == AMDGPU::V_ADD_F16_e64 && MFI->getMode().FP64FP16OutputDenormals)) + ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64) && + MFI->getMode().FP64FP16OutputDenormals)) return std::make_pair(nullptr, SIOutMods::NONE); // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x @@ -1757,8 +1773,8 @@ // TODO: Omod might be OK if there is NSZ only on the source // instruction, and not the omod multiply. 
- if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) || - !tryFoldOMod(MI)) + if ((IsIEEEMode && !MI.getFlag(MachineInstr::FmNoNans)) || + (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) || !tryFoldOMod(MI)) tryFoldClamp(MI); continue; Index: llvm/test/CodeGen/AMDGPU/omod.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/omod.ll +++ llvm/test/CodeGen/AMDGPU/omod.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI %s -; IEEE bit enabled for compute kernel, no shouldn't use. +; IEEE bit enabled for compute kernel, so shouldn't use. ; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_signed_zeros: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}} @@ -17,7 +17,23 @@ ret void } -; IEEE bit enabled for compute kernel, no shouldn't use even though nsz is allowed +; IEEE bit enabled for compute kernel, so shouldn't use. 
+; GCN-LABEL: {{^}}v_omod_div2_f64_enable_ieee_signed_zeros: +; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; GCN: v_add_f64 [[ADD:v\[[0-9]+:[0-9]+\]]], [[A]], 1.0{{$}} +; GCN: v_mul_f64 v{{\[[0-9]+:[0-9]+\]}}, [[ADD]], 0.5{{$}} +define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(double addrspace(1)* %out, double addrspace(1)* %aptr) #4 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid + %a = load double, double addrspace(1)* %gep0 + %add = fadd double %a, 1.0 + %div2 = fmul double %add, 0.5 + store double %div2, double addrspace(1)* %out.gep + ret void +} + +; IEEE bit enabled for compute kernel, so shouldn't use even though nsz is allowed ; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_nsz: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}} @@ -33,6 +49,22 @@ ret void } +; IEEE bit enabled for compute kernel, so shouldn't use even though nsz is allowed. +; GCN-LABEL: {{^}}v_omod_div2_f64_enable_ieee_nsz: +; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; GCN: v_add_f64 [[ADD:v\[[0-9]+:[0-9]+\]]], [[A]], 1.0{{$}} +; GCN: v_mul_f64 v{{\[[0-9]+:[0-9]+\]}}, [[ADD]], 0.5{{$}} +define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(double addrspace(1)* %out, double addrspace(1)* %aptr) #5 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid + %a = load double, double addrspace(1)* %gep0 + %add = fadd double %a, 1.0 + %div2 = fmul double %add, 0.5 + store double %div2, double addrspace(1)* %out.gep + ret void +} + ; Only allow without IEEE bit if signed zeros are significant. 
; GCN-LABEL: {{^}}v_omod_div2_f32_signed_zeros: ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}} @@ -44,6 +76,17 @@ ret void } +; Only allow without IEEE bit if signed zeros are significant. +; GCN-LABEL: {{^}}v_omod_div2_f64_signed_zeros: +; GCN: v_add_f64 [[ADD:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, 1.0{{$}} +; GCN: v_mul_f64 v{{\[[0-9]+:[0-9]+\]}}, [[ADD]], 0.5{{$}} +define amdgpu_ps void @v_omod_div2_f64_signed_zeros(double %a) #4 { + %add = fadd double %a, 1.0 + %div2 = fmul double %add, 0.5 + store double %div2, double addrspace(1)* undef + ret void +} + ; GCN-LABEL: {{^}}v_omod_div2_f32: ; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 div:2{{$}} define amdgpu_ps void @v_omod_div2_f32(float %a) #0 { @@ -53,6 +96,25 @@ ret void } +; GCN-LABEL: {{^}}v_omod_div2_f64: +; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 1.0 div:2{{$}} +define amdgpu_ps void @v_omod_div2_f64(double %a) #5 { + %add = fadd nsz double %a, 1.0 + %div2 = fmul nsz double %add, 0.5 + store double %div2, double addrspace(1)* undef + ret void +} + +; IEEE mode is enabled, but nnan flag is set. +; GCN-LABEL: {{^}}v_omod_div2_f64_nnan: +; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0 div:2{{$}} +define amdgpu_kernel void @v_omod_div2_f64_nnan(double %a) #5 { + %add = fadd nnan nsz double %a, 1.0 + %div2 = fmul nnan nsz double %add, 0.5 + store double %div2, double addrspace(1)* undef + ret void +} + ; GCN-LABEL: {{^}}v_omod_mul2_f32: ; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:2{{$}} define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 { @@ -62,6 +124,25 @@ ret void } +; GCN-LABEL: {{^}}v_omod_mul2_f64: +; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 1.0 mul:2{{$}} +define amdgpu_ps void @v_omod_mul2_f64(double %a) #5 { + %add = fadd nsz double %a, 1.0 + %div2 = fmul nsz double %add, 2.0 + store double %div2, double addrspace(1)* undef + ret void +} + +; IEEE mode is enabled, but nnan flag is set. 
+; GCN-LABEL: {{^}}v_omod_mul2_f64_nnan: +; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0 mul:2{{$}} +define amdgpu_kernel void @v_omod_mul2_f64_nnan(double %a) #5 { + %add = fadd nnan nsz double %a, 1.0 + %div2 = fmul nnan nsz double %add, 2.0 + store double %div2, double addrspace(1)* undef + ret void +} + ; GCN-LABEL: {{^}}v_omod_mul4_f32: ; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:4{{$}} define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 { @@ -71,6 +152,25 @@ ret void } +; GCN-LABEL: {{^}}v_omod_mul4_f64: +; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 1.0 mul:4{{$}} +define amdgpu_ps void @v_omod_mul4_f64(double %a) #5 { + %add = fadd nsz double %a, 1.0 + %div2 = fmul nsz double %add, 4.0 + store double %div2, double addrspace(1)* undef + ret void +} + +; IEEE mode is enabled, but nnan flag is set. +; GCN-LABEL: {{^}}v_omod_mul4_f64_nnan: +; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0 mul:4{{$}} +define amdgpu_kernel void @v_omod_mul4_f64_nnan(double %a) #5 { + %add = fadd nnan nsz double %a, 1.0 + %div2 = fmul nnan nsz double %add, 4.0 + store double %div2, double addrspace(1)* undef + ret void +} + ; GCN-LABEL: {{^}}v_omod_mul4_multi_use_f32: ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}} ; GCN: v_mul_f32_e32 v{{[0-9]+}}, 4.0, [[ADD]]{{$}} @@ -207,6 +307,17 @@ ret void } +; Don't fold omod if denorms enabled. +; GCN-LABEL: {{^}}v_omod_div2_f64_denormals: +; GCN: v_add_f64 [[ADD:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, 1.0{{$}} +; GCN: v_mul_f64 v{{\[[0-9]+:[0-9]+\]}}, [[ADD]], 0.5{{$}} +define amdgpu_ps void @v_omod_div2_f64_denormals(double %a) #6 { + %add = fadd double %a, 1.0 + %div2 = fmul double %add, 0.5 + store double %div2, double addrspace(1)* undef + ret void +} + ; Don't fold omod if denorms enabled for add form. 
; GCN-LABEL: {{^}}v_omod_mul2_f32_denormals: ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}} @@ -218,6 +329,17 @@ ret void } +; Don't fold omod if denorms enabled for add form. +; GCN-LABEL: {{^}}v_omod_mul2_f64_denormals: +; GCN: v_add_f64 [[ADD:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, 1.0{{$}} +; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, [[ADD]], [[ADD]]{{$}} +define amdgpu_ps void @v_omod_mul2_f64_denormals(double %a) #2 { + %add = fadd double %a, 1.0 + %mul2 = fadd double %add, %add + store double %mul2, double addrspace(1)* undef + ret void +} + ; Don't fold omod if denorms enabled ; GCN-LABEL: {{^}}v_omod_div2_f16_denormals: ; VI: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}} @@ -280,6 +402,8 @@ attributes #2 = { nounwind "denormal-fp-math-f32"="ieee,ieee" "no-signed-zeros-fp-math"="true" } attributes #3 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" } attributes #4 = { nounwind "no-signed-zeros-fp-math"="false" } +attributes #5 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } +attributes #6 = { nounwind "denormal-fp-math"="ieee,ieee" "no-signed-zeros-fp-math"="true" } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!2, !3}