Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -44,63 +44,6 @@ namespace { -// Instructions that will be lowered with a final instruction that zeros the -// high result bits. -// XXX - only need to list legal operations. -static bool fp16SrcZerosHighBits(unsigned Opc) { - switch (Opc) { - case ISD::FADD: - case ISD::FSUB: - case ISD::FMUL: - case ISD::FDIV: - case ISD::FREM: - case ISD::FMA: - case ISD::FMAD: - case ISD::FCANONICALIZE: - case ISD::FP_ROUND: - case ISD::UINT_TO_FP: - case ISD::SINT_TO_FP: - case ISD::FABS: - // Fabs is lowered to a bit operation, but it's an and which will clear the - // high bits anyway. - case ISD::FSQRT: - case ISD::FSIN: - case ISD::FCOS: - case ISD::FPOWI: - case ISD::FPOW: - case ISD::FLOG: - case ISD::FLOG2: - case ISD::FLOG10: - case ISD::FEXP: - case ISD::FEXP2: - case ISD::FCEIL: - case ISD::FTRUNC: - case ISD::FRINT: - case ISD::FNEARBYINT: - case ISD::FROUND: - case ISD::FFLOOR: - case ISD::FMINNUM: - case ISD::FMAXNUM: - case AMDGPUISD::FRACT: - case AMDGPUISD::CLAMP: - case AMDGPUISD::COS_HW: - case AMDGPUISD::SIN_HW: - case AMDGPUISD::FMIN3: - case AMDGPUISD::FMAX3: - case AMDGPUISD::FMED3: - case AMDGPUISD::FMAD_FTZ: - case AMDGPUISD::RCP: - case AMDGPUISD::RSQ: - case AMDGPUISD::RCP_IFLAG: - case AMDGPUISD::LDEXP: - return true; - default: - // fcopysign, select and others may be lowered to 32-bit bit operations - // which don't zero the high bits. - return false; - } -} - static bool isNullConstantOrUndef(SDValue V) { if (V.isUndef()) return true; @@ -164,6 +107,10 @@ bool EnableLateStructurizeCFG; + // Returns true if opcode Opc is lowered with a final instruction that zeros + // the high result bits on the current subtarget. 
+ bool fp16SrcZerosHighBits(unsigned Opc) const; + public: explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr, CodeGenOpt::Level OptLevel = CodeGenOpt::Default) @@ -459,6 +406,68 @@ return SelectionDAGISel::runOnMachineFunction(MF); } +bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const { + // XXX - Only legal operations need to be listed. + switch (Opc) { + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + case ISD::FDIV: + case ISD::FREM: + case ISD::FCANONICALIZE: + case ISD::UINT_TO_FP: + case ISD::SINT_TO_FP: + case ISD::FABS: + // Fabs is lowered to a bit operation, but it is an AND, which clears the + // high bits anyway. + case ISD::FSQRT: + case ISD::FSIN: + case ISD::FCOS: + case ISD::FPOWI: + case ISD::FPOW: + case ISD::FLOG: + case ISD::FLOG2: + case ISD::FLOG10: + case ISD::FEXP: + case ISD::FEXP2: + case ISD::FCEIL: + case ISD::FTRUNC: + case ISD::FRINT: + case ISD::FNEARBYINT: + case ISD::FROUND: + case ISD::FFLOOR: + case ISD::FMINNUM: + case ISD::FMAXNUM: + case AMDGPUISD::FRACT: + case AMDGPUISD::CLAMP: + case AMDGPUISD::COS_HW: + case AMDGPUISD::SIN_HW: + case AMDGPUISD::FMIN3: + case AMDGPUISD::FMAX3: + case AMDGPUISD::FMED3: + case AMDGPUISD::FMAD_FTZ: + case AMDGPUISD::RCP: + case AMDGPUISD::RSQ: + case AMDGPUISD::RCP_IFLAG: + case AMDGPUISD::LDEXP: + // On gfx10, all 16-bit instructions preserve (do not zero) the high bits. + return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9; + case ISD::FP_ROUND: + // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the + // high bits on gfx9. + // TODO: If we had the source node we could see if the source was fma/mad. + return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS; + case ISD::FMA: + case ISD::FMAD: + case AMDGPUISD::DIV_FIXUP: + return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS; + default: + // fcopysign, select and others may be lowered to 32-bit bit operations + // which don't zero the high bits. 
+ return false; + } +} + bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const { assert(Subtarget->d16PreservesUnusedBits()); MVT VT = N->getValueType(0).getSimpleVT(); Index: llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1995,10 +1995,16 @@ // Eliminate a zero extension from an fp16 operation if it already // zeros the high bits of the 32-bit register. +// +// This is complicated on gfx9+. Some instructions maintain the legacy +// zeroing behavior, but others preserve the high bits. Some have a +// control bit to change the behavior. We can't say with certainty +// what the source behavior is without more context on how the src is +// lowered, e.g. fptrunc (fma) may or may not be lowered to a +// v_fma_mix* instruction, which does not zero the high bits. def : GCNPat< (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))), - (COPY VSrc_b16:$src) ->; + (COPY VSrc_b16:$src)>; def : GCNPat < (i32 (trunc i64:$a)), Index: llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -140,7 +140,8 @@ ; GCN-LABEL: {{^}}fptrunc_f32_to_f16_zext_i32: ; GCN: buffer_load_dword v[[A_F32:[0-9]+]] ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] -; GCN-NOT: v[[R_F16]] +; SIVI-NOT: v[[R_F16]] +; GFX9-NEXT: v_and_b32_e32 v[[R_F16]], 0xffff, v[[R_F16]] ; GCN: buffer_store_dword v[[R_F16]] define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( i32 addrspace(1)* %r, @@ -157,7 +158,8 @@ ; GCN-LABEL: {{^}}fptrunc_fabs_f32_to_f16_zext_i32: ; GCN: buffer_load_dword v[[A_F32:[0-9]+]] ; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]| -; GCN-NOT: v[[R_F16]] +; SIVI-NOT: v[[R_F16]] +; GFX9-NEXT: v_and_b32_e32 v[[R_F16]], 0xffff, v[[R_F16]] ; GCN: buffer_store_dword v[[R_F16]] 
define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( i32 addrspace(1)* %r, Index: llvm/test/CodeGen/AMDGPU/preserve-hi16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/preserve-hi16.ll +++ llvm/test/CodeGen/AMDGPU/preserve-hi16.ll @@ -1,4 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX900 %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX906 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; GCN-LABEL: {{^}}shl_i16: @@ -188,3 +190,93 @@ %zext = zext i16 %res to i32 ret i32 %zext } + +; GCN-LABEL: {{^}}zext_fadd_f16: +; GFX8: v_add_f16_e32 [[ADD:v[0-9]+]], v0, v1 +; GFX8-NEXT: s_setpc_b64 + +; GFX9: v_add_f16_e32 [[ADD:v[0-9]+]], v0, v1 +; GFX9-NEXT: s_setpc_b64 + +; GFX10: v_add_f16_e32 [[ADD:v[0-9]+]], v0, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, [[ADD]] +define i32 @zext_fadd_f16(half %x, half %y) { + %add = fadd half %x, %y + %cast = bitcast half %add to i16 + %zext = zext i16 %cast to i32 + ret i32 %zext +} + +; GCN-LABEL: {{^}}zext_fma_f16: +; GFX8: v_fma_f16 [[FMA:v[0-9]+]], v0, v1, v2 +; GFX8-NEXT: s_setpc_b64 + +; GFX9: v_fma_f16 [[FMA:v[0-9]+]], v0, v1, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, [[FMA]] + +; GFX10: v_fmac_f16_e32 [[FMA:v[0-9]+]], v0, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, [[FMA]] +define i32 @zext_fma_f16(half %x, half %y, half %z) { + %fma = call half @llvm.fma.f16(half %x, half %y, half %z) + %cast = bitcast half %fma to i16 + %zext = zext i16 %cast to i32 + ret i32 %zext +} + +; GCN-LABEL: {{^}}zext_div_fixup_f16: +; GFX8: v_div_fixup_f16 v0, v0, v1, v2 +; GFX8-NEXT: s_setpc_b64 + 
+; GFX9: v_div_fixup_f16 v0, v0, v1, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 + +; GFX10: v_div_fixup_f16 v0, v0, v1, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +define i32 @zext_div_fixup_f16(half %x, half %y, half %z) { + %div.fixup = call half @llvm.amdgcn.div.fixup.f16(half %x, half %y, half %z) + %cast = bitcast half %div.fixup to i16 + %zext = zext i16 %cast to i32 + ret i32 %zext +} + +; We technically could eliminate the v_and_b32 on gfx9 here, but we don't +; try to inspect the source of the fptrunc. We're only worried about cases +; that lower to v_fma_mix* instructions. + +; GCN-LABEL: {{^}}zext_fptrunc_f16: +; GFX8: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: s_setpc_b64 + +; GFX9: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 + +; GFX10: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +define i32 @zext_fptrunc_f16(float %x) { + %fptrunc = fptrunc float %x to half + %cast = bitcast half %fptrunc to i16 + %zext = zext i16 %cast to i32 + ret i32 %zext +} + +; GCN-LABEL: {{^}}zext_fptrunc_fma_f16: +; GFX900: v_fma_f32 v0, v0, v1, v2 +; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 + +; GFX906: v_fma_mixlo_f16 v0, v0, v1, v2 +; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 + +; GFX10: v_fma_mixlo_f16 v0, v0, v1, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +define i32 @zext_fptrunc_fma_f16(float %x, float %y, float %z) { + %fma = call float @llvm.fma.f32(float %x, float %y, float %z) + %fptrunc = fptrunc float %fma to half + %cast = bitcast half %fptrunc to i16 + %zext = zext i16 %cast to i32 + ret i32 %zext +} + +declare half @llvm.amdgcn.div.fixup.f16(half, half, half) +declare half @llvm.fma.f16(half, half, half) +declare float @llvm.fma.f32(float, float, float)