Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -193,8 +193,18 @@ bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const; bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectLo16Elt(SDValue In, SDValue &Src) const; bool SelectHi16Elt(SDValue In, SDValue &Src) const; + bool SelectCvtRtzF16F32(SDValue In, SDValue &Src) const; + bool SelectCvtRtzF16F32Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + + bool SelectCvtRtzF16F32Impl(SDValue In, SDValue &Src, bool hi) const; + bool SelectCvtRtzF16F32Lo(SDValue In, SDValue &Src) const; + bool SelectCvtRtzF16F32Hi(SDValue In, SDValue &Src) const; + bool SelectCvtRtzF16F32LoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectCvtRtzF16F32HiMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + void SelectADD_SUB_I64(SDNode *N); void SelectUADDO_USUBO(SDNode *N); void SelectDIV_SCALE(SDNode *N); @@ -1833,6 +1843,16 @@ return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val; } +// Figure out if this is really an extract of the low 16-bits of a dword. +static bool isExtractLoElt(SDValue In, SDValue &Out) { + In = stripBitcast(In); + if (In.getOpcode() != ISD::TRUNCATE) + return false; + + Out = stripBitcast(In.getOperand(0)); + return true; +} + // Figure out if this is really an extract of the high 16-bits of a dword. static bool isExtractHiElt(SDValue In, SDValue &Out) { In = stripBitcast(In); @@ -2016,6 +2036,17 @@ return true; } +bool AMDGPUDAGToDAGISel::SelectLo16Elt(SDValue In, SDValue &Src) const { + if (In.isUndef()) { + Src = In; + return true; + } + + // no constant handling unlike SelectHi16Elt() due to lack of need + + return isExtractLoElt(In, Src); +} + // TODO: Can we identify things like v_mad_mixhi_f16? 
 bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const {
   if (In.isUndef()) {
@@ -2045,6 +2076,137 @@
   return isExtractHiElt(In, Src);
 }
+bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32(SDValue In, SDValue &Src) const {
+  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(In);
+  ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(In);
+  if (CI || CF) {
+    uint64_t Val = CI ? CI->getLimitedValue() : 0;
+    if (CF)
+      Val = CF->getValueAPF().bitcastToAPInt().getLimitedValue();
+
+    if (Val >> 16)
+      return false;
+
+    bool LostInfo;
+    APFloat FVal = APFloat(APFloatBase::IEEEhalf(), APInt(16, Val));
+    FVal.convert(APFloatBase::IEEEsingle(), APFloatBase::rmNearestTiesToAway, &LostInfo);
+    if (LostInfo)
+      return false;
+
+    SDLoc SL(In);
+    SDValue K = CurDAG->getTargetConstant(FVal.bitcastToAPInt(), SL, MVT::i32);
+    MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::f32, K);
+    Src = SDValue(MovK, 0);
+    return true;
+  }
+
+  SDValue CvtPkRtz;
+
+  if (SelectHi16Elt(In, CvtPkRtz) && CvtPkRtz.getOpcode() == AMDGPUISD::CVT_PKRTZ_F16_F32) {
+    Src = CvtPkRtz.getOperand(1);
+    return true;
+  }
+
+  if (SelectLo16Elt(In, CvtPkRtz) && CvtPkRtz.getOpcode() == AMDGPUISD::CVT_PKRTZ_F16_F32) {
+    Src = CvtPkRtz.getOperand(0);
+    return true;
+  }
+
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32Mods(SDValue In, SDValue &Src, SDValue &Mods) const {
+  SDValue Tmp;
+  if (SelectCvtRtzF16F32(In, Tmp))
+    return SelectVOP3Mods(Tmp, Src, Mods);
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32Impl(SDValue In, SDValue &Src, bool hi) const {
+  In = stripBitcast(In);
+
+  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(In);
+  ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(In);
+  if (CI || CF) {
+    uint32_t Val = CI ? CI->getLimitedValue() : 0;
+    if (CF)
+      Val = CF->getValueAPF().bitcastToAPInt().getLimitedValue();
+    if (hi && Val & 0xffff)
+      return false;
+    if (!hi && Val >> 16)
+      return false;
+
+    Val = hi ?
Val >> 16 : Val;
+
+    bool LostInfo;
+    APFloat FVal = APFloat(APFloatBase::IEEEhalf(), APInt(16, Val));
+    FVal.convert(APFloatBase::IEEEsingle(), APFloatBase::rmNearestTiesToAway, &LostInfo);
+    if (LostInfo)
+      return false;
+
+    SDLoc SL(In);
+    SDValue K = CurDAG->getTargetConstant(FVal.bitcastToAPInt(), SL, MVT::i32);
+    MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::f32, K);
+    Src = SDValue(MovK, 0);
+    return true;
+  }
+
+  unsigned shiftOpcode = hi ? ISD::SHL : ISD::SRL;
+  int shiftOperand = hi ? 0 : 1;
+  uint32_t andMask = hi ? 0xffff0000u : 0xffffu;
+  int andOperand = hi ? 1 : 0;
+
+  if (In.getOpcode() == ISD::AND) {
+    // low: and(cvt_pkrtz(v, 0), 0xffff)
+    // high: and(cvt_pkrtz(0, v), 0xffff0000)
+    for (int i = 0; i < 2; i++) {
+      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In.getOperand(i))) {
+        uint32_t v = C->getZExtValue();
+        SDValue CvtPkRtz = stripBitcast(In.getOperand(!i));
+        if (v == andMask && CvtPkRtz.getOpcode() == AMDGPUISD::CVT_PKRTZ_F16_F32) {
+          Src = CvtPkRtz.getOperand(andOperand);
+          return true;
+        }
+      }
+    }
+  } else if (In.getOpcode() == shiftOpcode) {
+    // low: srl(cvt_pkrtz(0, v), 16)
+    // high: shl(cvt_pkrtz(v, 0), 16)
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
+      uint32_t v = C->getZExtValue();
+      SDValue CvtPkRtz = stripBitcast(In.getOperand(0));
+      if (v == 16 && CvtPkRtz.getOpcode() == AMDGPUISD::CVT_PKRTZ_F16_F32) {
+        Src = CvtPkRtz.getOperand(shiftOperand);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32Lo(SDValue In, SDValue &Src) const {
+  return SelectCvtRtzF16F32Impl(In, Src, false);
+}
+
+bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32Hi(SDValue In, SDValue &Src) const {
+  return SelectCvtRtzF16F32Impl(In, Src, true);
+}
+
+bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32LoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const {
+  SDValue Tmp;
+  if (SelectCvtRtzF16F32Lo(In, Tmp))
+    return SelectVOP3Mods(Tmp, Src, SrcMods);
+  return false;
+}
+
+bool
AMDGPUDAGToDAGISel::SelectCvtRtzF16F32HiMods(SDValue In, SDValue &Src, SDValue &SrcMods) const {
+  SDValue Tmp;
+  if (SelectCvtRtzF16F32Hi(In, Tmp))
+    return SelectVOP3Mods(Tmp, Src, SrcMods);
+  return false;
+}
+
 bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
   if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
     return false;
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -958,6 +958,15 @@
 def Hi16Elt : ComplexPattern<untyped, 1, "SelectHi16Elt">;
+def CvtRtzF16F32 : ComplexPattern<untyped, 1, "SelectCvtRtzF16F32">;
+def CvtRtzF16F32Mods : ComplexPattern<untyped, 2, "SelectCvtRtzF16F32Mods">;
+
+def CvtRtzF16F32Lo : ComplexPattern<untyped, 1, "SelectCvtRtzF16F32Lo">;
+def CvtRtzF16F32LoMods : ComplexPattern<untyped, 2, "SelectCvtRtzF16F32LoMods">;
+
+def CvtRtzF16F32Hi : ComplexPattern<untyped, 1, "SelectCvtRtzF16F32Hi">;
+def CvtRtzF16F32HiMods : ComplexPattern<untyped, 2, "SelectCvtRtzF16F32HiMods">;
+
 //===----------------------------------------------------------------------===//
 // SI assembler operands
 //===----------------------------------------------------------------------===//
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -1594,6 +1594,43 @@
 // Miscellaneous Optimization Patterns
 //============================================================================//
+let SubtargetPredicate = isGCN in {
+
+def : GCNPat <
+  (v2f16 (build_vector (f16 (CvtRtzF16F32Mods f32:$src0, i32:$src0_mods)),
+                       (f16 (CvtRtzF16F32Mods f32:$src1, i32:$src1_mods)))),
+  (V_CVT_PKRTZ_F16_F32_e64 $src0_mods, $src0,
+                           $src1_mods, $src1,
+                           DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : GCNPat <
+  (i32 (or (CvtRtzF16F32LoMods f32:$src0, i32:$src0_mods),
+           (CvtRtzF16F32HiMods f32:$src1, i32:$src1_mods))),
+  (V_CVT_PKRTZ_F16_F32_e64 $src0_mods, $src0,
+                           $src1_mods, $src1,
+                           DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+} // End SubtargetPredicate = isGCN
+
+let SubtargetPredicate = isSICI in {
+
+def : GCNPat <
+  (v2f16 (build_vector (f16
(CvtRtzF16F32 f32:$src0)), + (f16 (CvtRtzF16F32 f32:$src1)))), + (V_CVT_PKRTZ_F16_F32_e32 $src0, $src1) +>; + +def : GCNPat < + (i32 (or (i32 (CvtRtzF16F32Lo f32:$src0)), + (i32 (CvtRtzF16F32Hi f32:$src1)))), + (V_CVT_PKRTZ_F16_F32_e32 $src0, $src1) +>; + +} // End SubtargetPredicates = isSICI + + // Undo sub x, c -> add x, -c canonicalization since c is more likely // an inline immediate than -c. // TODO: Also do for 64-bit. Index: test/CodeGen/AMDGPU/cvt_pkrtz_f16_f32_combine.ll =================================================================== --- test/CodeGen/AMDGPU/cvt_pkrtz_f16_f32_combine.ll +++ test/CodeGen/AMDGPU/cvt_pkrtz_f16_f32_combine.ll @@ -0,0 +1,126 @@ +; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes ALL,GFX67 %s +; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes ALL,GFX67 %s +; RUN: llc -march=amdgcn -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes ALL,GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes ALL,GFX89 %s + +; ALL-LABEL: {{^}}packed_convert_low: +; ALL: v_cvt_pkrtz_f16_f32{{(_e32|_e64)?}} v{{[0-9]*}}, v0, v1 +; ALL-NOT: v_cvt_pkrtz_f16_f32 +define i32 @packed_convert_low(float %a, float %b) #0 { + %a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 0.0) + %a_half = extractelement <2 x half> %a_half_vec, i32 0 + %b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %b, float 0.0) + %b_half = extractelement <2 x half> %b_half_vec, i32 0 + + %vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0 + %vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1 + + %vec_i32 = bitcast <2 x half> %vec to i32 + ret i32 %vec_i32 +} + +; ALL-LABEL: {{^}}packed_convert_high: +; ALL: v_cvt_pkrtz_f16_f32{{(_e32|_e64)?}} v{{[0-9]*}}, v0, v1 +; ALL-NOT: v_cvt_pkrtz_f16_f32 +define i32 
@packed_convert_high(float %a, float %b) #0 { + %a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %a) + %a_half = extractelement <2 x half> %a_half_vec, i32 1 + %b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %b) + %b_half = extractelement <2 x half> %b_half_vec, i32 1 + + %vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0 + %vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1 + + %vec_i32 = bitcast <2 x half> %vec to i32 + ret i32 %vec_i32 +} + +; ALL-LABEL: {{^}}packed_convert_low_high: +; ALL: v_cvt_pkrtz_f16_f32{{(_e32|_e64)?}} v{{[0-9]*}}, v0, v1 +; ALL-NOT: v_cvt_pkrtz_f16_f32 +define i32 @packed_convert_low_high(float %a, float %b) #0 { + %a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 0.0) + %a_half = extractelement <2 x half> %a_half_vec, i32 0 + %b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %b) + %b_half = extractelement <2 x half> %b_half_vec, i32 1 + + %vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0 + %vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1 + + %vec_i32 = bitcast <2 x half> %vec to i32 + ret i32 %vec_i32 +} + +; ALL-LABEL: {{^}}packed_convert_high_low: +; ALL: v_cvt_pkrtz_f16_f32{{(_e32|_e64)?}} v{{[0-9]*}}, v0, v1 +; ALL-NOT: v_cvt_pkrtz_f16_f32 +define i32 @packed_convert_high_low(float %a, float %b) #0 { + %a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %a) + %a_half = extractelement <2 x half> %a_half_vec, i32 1 + %b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %b, float 0.0) + %b_half = extractelement <2 x half> %b_half_vec, i32 0 + + %vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0 + %vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1 + + %vec_i32 = bitcast <2 x half> %vec to i32 + ret i32 %vec_i32 +} + +; ALL-LABEL: {{^}}packed_convert_imm: +; GFX89: s_mov_b32 s{{[0-9]*}}, 0x40a00000 +; GFX89: v_cvt_pkrtz_f16_f32{{(_e32|_e64)?}} v{{[0-9]*}}, 
s{{[0-9]*}}, v1 +; GFX67: v_cvt_pkrtz_f16_f32{{(_e32|_e64)?}} v{{[0-9]*}}, 0x40a00000, v1 +; GFX67-NOT: v_cvt_pkrtz_f16_f32 +define i32 @packed_convert_imm(float %a, float %b) #0 { + %a_half = fptrunc float 5.0 to half + %b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %b, float 0.0) + %b_half = extractelement <2 x half> %b_half_vec, i32 0 + + %vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0 + %vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1 + + %vec_i32 = bitcast <2 x half> %vec to i32 + ret i32 %vec_i32 +} + +; ALL-LABEL: {{^}}packed_convert_low_neg: +; ALL: v_cvt_pkrtz_f16_f32{{(_e64)?}} v{{[0-9]*}}, -v0, v1 +; ALL-NOT: v_cvt_pkrtz_f16_f32 +define i32 @packed_convert_low_neg(float %a, float %b) #0 { + %a_neg = fneg float %a + + %a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a_neg, float 0.0) + %a_half = extractelement <2 x half> %a_half_vec, i32 0 + %b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %b, float 0.0) + %b_half = extractelement <2 x half> %b_half_vec, i32 0 + + %vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0 + %vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1 + + %vec_i32 = bitcast <2 x half> %vec to i32 + ret i32 %vec_i32 +} + +; ALL-LABEL: {{^}}packed_convert_high_neg: +; ALL: v_cvt_pkrtz_f16_f32{{(_e64)?}} v{{[0-9]*}}, -v0, v1 +; ALL-NOT: v_cvt_pkrtz_f16_f32 +define i32 @packed_convert_high_neg(float %a, float %b) #0 { + %a_neg = fneg float %a + + %a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %a_neg) + %a_half = extractelement <2 x half> %a_half_vec, i32 1 + %b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %b) + %b_half = extractelement <2 x half> %b_half_vec, i32 1 + + %vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0 + %vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1 + + %vec_i32 = bitcast <2 x half> %vec to i32 + ret i32 %vec_i32 +} + +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) 
#1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone }