Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4616,15 +4616,93 @@
   return SDValue();
 }
 
+static bool isCanonicalized(SDValue Op, const SISubtarget *ST,
+                            unsigned MaxDepth=5) {
+  // If source is a result of another standard FP operation it is already in
+  // canonical form.
+
+  switch (Op.getOpcode()) {
+  default:
+    break;
+
+  // These will flush denorms if required.
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
+  case ISD::FSQRT:
+  case ISD::FCEIL:
+  case ISD::FFLOOR:
+  case ISD::FMA:
+  case ISD::FMAD:
+
+  case ISD::FCANONICALIZE:
+    return true;
+
+  case ISD::FP_ROUND:
+    return Op.getValueType().getScalarType() != MVT::f16 ||
+           ST->hasFP16Denormals();
+
+  case ISD::FP_EXTEND:
+    return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
+           ST->hasFP16Denormals();
+
+  case ISD::FP16_TO_FP:
+  case ISD::FP_TO_FP16:
+    return ST->hasFP16Denormals();
+
+  // It can/will be lowered or combined as a bit operation.
+  // Need to check their input recursively to handle.
+  case ISD::FNEG:
+  case ISD::FABS:
+    return (MaxDepth > 0) &&
+           isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1);
+
+  case ISD::FSIN:
+  case ISD::FCOS:
+  case ISD::FSINCOS:
+    return Op.getValueType().getScalarType() != MVT::f16;
+
+  // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
+  // For such targets need to check their input recursively.
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+  case ISD::FMINNAN:
+  case ISD::FMAXNAN:
+    if (ST->getGeneration() >= SISubtarget::GFX9)
+      return true;
+
+    return (MaxDepth > 0) &&
+           isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1) &&
+           isCanonicalized(Op.getOperand(1), ST, MaxDepth - 1);
+
+  case ISD::ConstantFP: {
+    auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
+    return !F.isDenormal() && !(F.isNaN() && F.isSignaling());
+  }
+  }
+  return false;
+}
+
 // Constant fold canonicalize.
 SDValue SITargetLowering::performFCanonicalizeCombine(
   SDNode *N,
   DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
   ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
-  if (!CFP)
+
+  if (!CFP) {
+    SDValue N0 = N->getOperand(0);
+
+    bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()) ||
+                      getTargetMachine().Options.NoNaNsFPMath ||
+                      N->getFlags().hasNoNaNs();
+
+    if (IsIEEEMode && isCanonicalized(N0, getSubtarget()))
+      return N0;
+
     return SDValue();
+  }
 
-  SelectionDAG &DAG = DCI.DAG;
   const APFloat &C = CFP->getValueAPF();
 
   // Flush denormals to 0 if not enabled.
Index: test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -0,0 +1,483 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+
+; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %v = load float, float addrspace(1)* %gep, align 4
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_value_f32:
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = fmul float %load, 15.0
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_sub_value_f32:
+; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = fsub float 15.0, %load
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_add_value_f32:
+; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = fadd float %load, 15.0
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_sqrt_value_f32:
+; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = call float @llvm.sqrt.f32(float %load)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fceil_value_f32:
+; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = call float @llvm.ceil.f32(float %load)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_floor_value_f32:
+; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = call float @llvm.floor.f32(float %load)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fma_value_f32:
+; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = call float @llvm.fma.f32(float %load, float 15.0, float 15.0)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32:
+; VI: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = call float @llvm.fmuladd.f32(float %load, float 15.0, float 15.0)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_canonicalize_value_f32:
+; GCN: flat_load_dword [[LOAD:v[0-9]+]],
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = call float @llvm.canonicalize.f32(float %load)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpextend_value_f64_f32:
+; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
+; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float addrspace(1)* %arg, double addrspace(1)* %out) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = fpext float %load to double
+  %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
+  %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
+  store double %canonicalized, double addrspace(1)* %gep2, align 8
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16:
+; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half addrspace(1)* %arg, float addrspace(1)* %out) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
+  %load = load half, half addrspace(1)* %gep, align 2
+  %v = fpext half %load to float
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
+  store float %canonicalized, float addrspace(1)* %gep2, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpround_value_f32_f64:
+; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(double addrspace(1)* %arg, float addrspace(1)* %out) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
+  %load = load double, double addrspace(1)* %gep, align 8
+  %v = fptrunc double %load to float
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
+  store float %canonicalized, float addrspace(1)* %gep2, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32:
+; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(float addrspace(1)* %arg, half addrspace(1)* %out) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = fptrunc float %load to half
+  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
+  %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
+  store half %canonicalized, half addrspace(1)* %gep2, align 2
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32:
+; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
+; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}
+; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V1]], [[V0]]
+; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
+; GFX9: v_and_b32_e32 [[V0_16:v[0-9]+]], 0xffff, [[V0]]
+; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %v = fptrunc <2 x float> %load to <2 x half>
+  %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v)
+  %gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %id
+  store <2 x half> %canonicalized, <2 x half> addrspace(1)* %gep2, align 4
+  ret void
+}
+
+; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32:
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, -v{{[0-9]+}}
+define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = fsub float -0.0, %load
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fneg_value_f32:
+; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v0 = fadd float %load, 0.0
+  %v = fsub float -0.0, %v0
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32:
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
+define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = tail call float @llvm.fabs.f32(float %load)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fabs_value_f32:
+; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v0 = fadd float %load, 0.0
+  %v = tail call float @llvm.fabs.f32(float %v0)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_sin_value_f32:
+; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = tail call float @llvm.sin.f32(float %load)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_cos_value_f32:
+; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = tail call float @llvm.cos.f32(float %load)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_sin_value_f16:
+; GCN: v_sin_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
+; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
+; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
+  %load = load half, half addrspace(1)* %gep, align 2
+  %v = tail call half @llvm.sin.f16(half %load)
+  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
+  store half %canonicalized, half addrspace(1)* %gep, align 2
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_cos_value_f16:
+; GCN: v_cos_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
+; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
+; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
+  %load = load half, half addrspace(1)* %gep, align 2
+  %v = tail call half @llvm.cos.f16(half %load)
+  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
+  store half %canonicalized, half addrspace(1)* %gep, align 2
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_qNaN_value_f32:
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %canonicalized = tail call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32:
+; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
+; GFX9: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GFX9-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_minnum_value_f32:
+; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v0 = fadd float %load, 0.0
+  %v = tail call float @llvm.minnum.f32(float %v0, float 0.0)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32:
+; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7f800001, v{{[0-9]+}}
+; VI: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V0]]
+; VI: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GFX9: flat_store_dword v[{{[0-9:]+}}], [[V0]]
+; GFX9-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 2139095041 to float))
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
+; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; VI: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V0]]
+; VI: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GFX9: flat_store_dword v[{{[0-9:]+}}], [[V0]]
+; GFX9-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32:
+; GCN: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
+; VI: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V0]]
+; VI: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GFX9: flat_store_dword v[{{[0-9:]+}}], [[V0]]
+; GFX9-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = tail call float @llvm.maxnum.f32(float %load, float 0.0)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_maxnum_value_f32:
+; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(float addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v0 = fadd float %load, 0.0
+  %v = tail call float @llvm.maxnum.f32(float %v0, float 0.0)
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  store float %canonicalized, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_maxnum_value_f64:
+; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0
+; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(double addrspace(1)* %arg) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
+  %load = load double, double addrspace(1)* %gep, align 8
+  %v0 = fadd double %load, 0.0
+  %v = tail call double @llvm.maxnum.f64(double %v0, double 0.0)
+  %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
+  store double %canonicalized, double addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: test_no_fold_canonicalize_fmul_value_f32_no_ieee:
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+define amdgpu_ps float @test_no_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) {
+entry:
+  %v = fmul float %arg, 15.0
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  ret float %canonicalized
+}
+
+declare float @llvm.canonicalize.f32(float) #0
+declare double @llvm.canonicalize.f64(double) #0
+declare half @llvm.canonicalize.f16(half) #0
+declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare float @llvm.sqrt.f32(float) #0
+declare float @llvm.ceil.f32(float) #0
+declare float @llvm.floor.f32(float) #0
+declare float @llvm.fma.f32(float, float, float) #0
+declare float @llvm.fmuladd.f32(float, float, float) #0
+declare float @llvm.fabs.f32(float) #0
+declare float @llvm.sin.f32(float) #0
+declare float @llvm.cos.f32(float) #0
+declare half @llvm.sin.f16(half) #0
+declare half @llvm.cos.f16(half) #0
+declare float @llvm.minnum.f32(float, float) #0
+declare float @llvm.maxnum.f32(float, float) #0
+declare double @llvm.maxnum.f64(double, double) #0
+
+attributes #0 = { nounwind readnone }