Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -44,6 +44,63 @@ namespace { +// Instructions that will be lowered with a final instruction that zeros the +// high result bits. +// XXX - probably only need to list legal operations. +static bool fp16SrcZerosHighBits(unsigned Opc) { + switch (Opc) { + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + case ISD::FDIV: + case ISD::FREM: + case ISD::FMA: + case ISD::FMAD: + case ISD::FCANONICALIZE: + case ISD::FP_ROUND: + case ISD::UINT_TO_FP: + case ISD::SINT_TO_FP: + case ISD::FABS: + // Fabs is lowered to a bit operation, but it's an and which will clear the + // high bits anyway. + case ISD::FSQRT: + case ISD::FSIN: + case ISD::FCOS: + case ISD::FPOWI: + case ISD::FPOW: + case ISD::FLOG: + case ISD::FLOG2: + case ISD::FLOG10: + case ISD::FEXP: + case ISD::FEXP2: + case ISD::FCEIL: + case ISD::FTRUNC: + case ISD::FRINT: + case ISD::FNEARBYINT: + case ISD::FROUND: + case ISD::FFLOOR: + case ISD::FMINNUM: + case ISD::FMAXNUM: + case AMDGPUISD::FRACT: + case AMDGPUISD::CLAMP: + case AMDGPUISD::COS_HW: + case AMDGPUISD::SIN_HW: + case AMDGPUISD::FMIN3: + case AMDGPUISD::FMAX3: + case AMDGPUISD::FMED3: + case AMDGPUISD::FMAD_FTZ: + case AMDGPUISD::RCP: + case AMDGPUISD::RSQ: + case AMDGPUISD::RCP_IFLAG: + case AMDGPUISD::LDEXP: + return true; + default: + // fcopysign, select and others may be lowered to 32-bit bit operations + // which don't zero the high bits. + return false; + } +} + static bool isNullConstantOrUndef(SDValue V) { if (V.isUndef()) return true; Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -457,9 +457,6 @@ // are known 0. 
FP_TO_FP16, - // Wrapper around fp16 results that are known to zero the high bits. - FP16_ZEXT, - /// This node is for VLIW targets and it is used to represent a vector /// that is stored in consecutive registers with the same channel. /// For example: Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4309,7 +4309,6 @@ NODE_NAME_CASE(CVT_PK_I16_I32) NODE_NAME_CASE(CVT_PK_U16_U32) NODE_NAME_CASE(FP_TO_FP16) - NODE_NAME_CASE(FP16_ZEXT) NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) @@ -4439,8 +4438,7 @@ break; } - case AMDGPUISD::FP_TO_FP16: - case AMDGPUISD::FP16_ZEXT: { + case AMDGPUISD::FP_TO_FP16: { unsigned BitWidth = Known.getBitWidth(); // High bits are zero. @@ -4587,7 +4585,6 @@ case AMDGPUISD::BUFFER_LOAD_USHORT: return 16; case AMDGPUISD::FP_TO_FP16: - case AMDGPUISD::FP16_ZEXT: return 16; default: return 1; Index: llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -132,7 +132,6 @@ def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>; def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>; def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>; -def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>; def AMDGPUfp_class_impl : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9378,63 +9378,6 @@ return SDValue(); } -// Instructions that will be lowered with a final instruction that zeros the 
-// high result bits. -// XXX - probably only need to list legal operations. -static bool fp16SrcZerosHighBits(unsigned Opc) { - switch (Opc) { - case ISD::FADD: - case ISD::FSUB: - case ISD::FMUL: - case ISD::FDIV: - case ISD::FREM: - case ISD::FMA: - case ISD::FMAD: - case ISD::FCANONICALIZE: - case ISD::FP_ROUND: - case ISD::UINT_TO_FP: - case ISD::SINT_TO_FP: - case ISD::FABS: - // Fabs is lowered to a bit operation, but it's an and which will clear the - // high bits anyway. - case ISD::FSQRT: - case ISD::FSIN: - case ISD::FCOS: - case ISD::FPOWI: - case ISD::FPOW: - case ISD::FLOG: - case ISD::FLOG2: - case ISD::FLOG10: - case ISD::FEXP: - case ISD::FEXP2: - case ISD::FCEIL: - case ISD::FTRUNC: - case ISD::FRINT: - case ISD::FNEARBYINT: - case ISD::FROUND: - case ISD::FFLOOR: - case ISD::FMINNUM: - case ISD::FMAXNUM: - case AMDGPUISD::FRACT: - case AMDGPUISD::CLAMP: - case AMDGPUISD::COS_HW: - case AMDGPUISD::SIN_HW: - case AMDGPUISD::FMIN3: - case AMDGPUISD::FMAX3: - case AMDGPUISD::FMED3: - case AMDGPUISD::FMAD_FTZ: - case AMDGPUISD::RCP: - case AMDGPUISD::RSQ: - case AMDGPUISD::RCP_IFLAG: - case AMDGPUISD::LDEXP: - return true; - default: - // fcopysign, select and others may be lowered to 32-bit bit operations - // which don't zero the high bits. - return false; - } -} - SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (!Subtarget->has16BitInsts() || @@ -9449,15 +9392,6 @@ if (Src.getValueType() != MVT::i16) return SDValue(); - // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src - // FIXME: It is not universally true that the high bits are zeroed on gfx9. 
- if (Src.getOpcode() == ISD::BITCAST) { - SDValue BCSrc = Src.getOperand(0); - if (BCSrc.getValueType() == MVT::f16 && - fp16SrcZerosHighBits(BCSrc.getOpcode())) - return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc); - } - return SDValue(); } Index: llvm/lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -814,6 +814,12 @@ (isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode())); }], getNegV2I16Imm>; + +def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{ + return fp16SrcZerosHighBits(N->getOpcode()); +}]>; + + //===----------------------------------------------------------------------===// // MUBUF/SMEM Patterns //===----------------------------------------------------------------------===// Index: llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1992,11 +1992,13 @@ //===----------------------------------------------------------------------===// // Miscellaneous Patterns //===----------------------------------------------------------------------===// -def : GCNPat < - (i32 (AMDGPUfp16_zext f16:$src)), - (COPY $src) ->; +// Eliminate a zero extension from an fp16 operation if it already +// zeros the high bits of the 32-bit register. FIXME: Not always true on gfx9. +def : GCNPat< + (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))), + (COPY VSrc_b16:$src) +>; def : GCNPat < (i32 (trunc i64:$a)),