Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -193,8 +193,15 @@ bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const; bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectLo16Elt(SDValue In, SDValue &Src) const; bool SelectHi16Elt(SDValue In, SDValue &Src) const; + bool SelectCvtRtzF16F32(SDValue In, SDValue &Src, SDValue &SrcMods) const; + + bool SelectCvtRtzF16F32LoHiImpl(SDValue In, SDValue &Src, SDValue &SrcMods, bool hi) const; + bool SelectCvtRtzF16F32Lo(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectCvtRtzF16F32Hi(SDValue In, SDValue &Src, SDValue &SrcMods) const; + void SelectADD_SUB_I64(SDNode *N); void SelectUADDO_USUBO(SDNode *N); void SelectDIV_SCALE(SDNode *N); @@ -1893,6 +1900,16 @@ return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val; } +// Figure out if this is really an extract of the low 16-bits of a dword. +static bool isExtractLoElt(SDValue In, SDValue &Out) { + In = stripBitcast(In); + if (In.getOpcode() != ISD::TRUNCATE) + return false; + + Out = stripBitcast(In.getOperand(0)); + return true; +} + // Figure out if this is really an extract of the high 16-bits of a dword. static bool isExtractHiElt(SDValue In, SDValue &Out) { In = stripBitcast(In); @@ -2076,6 +2093,17 @@ return true; } +bool AMDGPUDAGToDAGISel::SelectLo16Elt(SDValue In, SDValue &Src) const { + if (In.isUndef()) { + Src = In; + return true; + } + + // No constant handling unlike SelectHi16Elt() due to lack of need. + + return isExtractLoElt(In, Src); +} + // TODO: Can we identify things like v_mad_mixhi_f16? 
bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const { if (In.isUndef()) { @@ -2105,6 +2133,88 @@ return isExtractHiElt(In, Src); } +bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32(SDValue In, SDValue &Src, SDValue &SrcMods) const { + In = stripBitcast(In); + if (In.getOpcode() == ISD::TRUNCATE) + In = In.getOperand(0); + return SelectCvtRtzF16F32Lo(In, Src, SrcMods); +} + +bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32LoHiImpl(SDValue In, SDValue &Src, SDValue &SrcMods, bool Hi) const { + In = stripBitcast(In); + + uint32_t Val; + if (getConstantValue(In, Val)) { + if (Hi && Val & 0xffff) + return false; + if (!Hi && Val >> 16) + return false; + + Val = Hi ? Val >> 16 : Val; + + bool LostInfo; + APFloat FVal(APFloatBase::IEEEhalf(), APInt(16, Val)); + FVal.convert(APFloatBase::IEEEsingle(), APFloatBase::rmNearestTiesToAway, &LostInfo); + if (LostInfo) + return false; + + SDLoc SL(In); + SDValue K = CurDAG->getTargetConstant(FVal.bitcastToAPInt(), SL, MVT::i32); + MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::f32, K); + Src = SDValue(MovK, 0); + SrcMods = CurDAG->getTargetConstant(0, SL, MVT::i32); + return true; + } + + unsigned shiftOpcode = Hi ? ISD::SHL : ISD::SRL; + int shiftOperand = Hi ? 0 : 1; + uint32_t andMask = Hi ? 0xffff0000u : 0xffffu; + int andOperand = Hi ? 
1 : 0;
+
+  if (In.getOpcode() == ISD::AND) {
+    // low:  and(cvt_pkrtz(v, ), 0xffff)
+    // high: and(cvt_pkrtz(, v), 0xffff0000)
+    for (int i = 0; i < 2; i++) {
+      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In.getOperand(i))) {
+        uint32_t v = C->getZExtValue();
+        SDValue CvtPkRtz = stripBitcast(In.getOperand(!i));
+        if (v == andMask && CvtPkRtz.getOpcode() == AMDGPUISD::CVT_PKRTZ_F16_F32)
+          return SelectVOP3Mods(CvtPkRtz.getOperand(andOperand), Src, SrcMods);
+      }
+    }
+  } else if (In.getOpcode() == shiftOpcode) {
+    // low:  srl(cvt_pkrtz(, v), 16)
+    // high: shl(cvt_pkrtz(v, ), 16)
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
+      uint32_t v = C->getZExtValue();
+      SDValue CvtPkRtz = stripBitcast(In.getOperand(0));
+      if (v == 16 && CvtPkRtz.getOpcode() == AMDGPUISD::CVT_PKRTZ_F16_F32)
+        return SelectVOP3Mods(CvtPkRtz.getOperand(shiftOperand), Src, SrcMods);
+    }
+  } else if (In.getOpcode() == AMDGPUISD::CVT_PKRTZ_F16_F32) {
+    // low:  cvt_pkrtz(v, 0)
+    // high: cvt_pkrtz(0, v)
+    uint32_t Val;
+    if (getConstantValue(In.getOperand(!Hi), Val)) {
+      APFloat FVal(APFloatBase::IEEEsingle(), APInt(32, Val));
+      bool LostInfo;
+      FVal.convert(APFloatBase::IEEEhalf(), APFloatBase::rmTowardZero, &LostInfo);
+      if (FVal.bitcastToAPInt().getZExtValue() == 0)
+        return SelectVOP3Mods(In.getOperand(Hi), Src, SrcMods);
+    }
+  }
+
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32Lo(SDValue In, SDValue &Src, SDValue &SrcMods) const {
+  return SelectCvtRtzF16F32LoHiImpl(In, Src, SrcMods, false);
+}
+
+bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32Hi(SDValue In, SDValue &Src, SDValue &SrcMods) const {
+  return SelectCvtRtzF16F32LoHiImpl(In, Src, SrcMods, true);
+}
+
 bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
   if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
     return false;
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -958,6 +958,10 @@
 def Hi16Elt : ComplexPattern<untyped, 1, "SelectHi16Elt">;
 
+def CvtRtzF16F32 : ComplexPattern<untyped, 2, "SelectCvtRtzF16F32">;
+def CvtRtzF16F32Lo : ComplexPattern<untyped, 2, "SelectCvtRtzF16F32Lo">;
+def CvtRtzF16F32Hi : ComplexPattern<untyped, 2, "SelectCvtRtzF16F32Hi">;
+
 //===----------------------------------------------------------------------===//
 // SI assembler operands
 //===----------------------------------------------------------------------===//
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -1580,6 +1580,22 @@
 // Miscellaneous Optimization Patterns
 //============================================================================//
 
+def : GCNPat <
+  (v2f16 (build_vector (f16 (CvtRtzF16F32 f32:$src0, i32:$src0_mods)),
+                       (f16 (CvtRtzF16F32 f32:$src1, i32:$src1_mods)))),
+  (V_CVT_PKRTZ_F16_F32_e64 $src0_mods, $src0,
+                           $src1_mods, $src1,
+                           DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : GCNPat <
+  (i32 (or (CvtRtzF16F32Lo f32:$src0, i32:$src0_mods),
+           (CvtRtzF16F32Hi f32:$src1, i32:$src1_mods))),
+  (V_CVT_PKRTZ_F16_F32_e64 $src0_mods, $src0,
+                           $src1_mods, $src1,
+                           DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
 // Undo sub x, c -> add x, -c canonicalization since c is more likely
 // an inline immediate than -c.
 // TODO: Also do for 64-bit.
Index: test/CodeGen/AMDGPU/cvt_pkrtz_f16_f32_combine.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/cvt_pkrtz_f16_f32_combine.ll @@ -0,0 +1,132 @@ +; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes ALL,GFX67 %s +; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes ALL,GFX67 %s +; RUN: llc -march=amdgcn -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes ALL,GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes ALL,GFX89 %s + +; ALL-LABEL: {{^}}packed_convert_low: +; GFX67: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]*}}, v0, v1 +; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]*}}, v0, v1 +; ALL-NOT: v_cvt_pkrtz_f16_f32 +define i32 @packed_convert_low(float %a, float %b) #0 { + %a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 0.0) + %a_half = extractelement <2 x half> %a_half_vec, i32 0 + %b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %b, float 0.0) + %b_half = extractelement <2 x half> %b_half_vec, i32 0 + + %vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0 + %vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1 + + %vec_i32 = bitcast <2 x half> %vec to i32 + ret i32 %vec_i32 +} + +; ALL-LABEL: {{^}}packed_convert_high: +; GFX67: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]*}}, v0, v1 +; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]*}}, v0, v1 +; ALL-NOT: v_cvt_pkrtz_f16_f32 +define i32 @packed_convert_high(float %a, float %b) #0 { + %a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %a) + %a_half = extractelement <2 x half> %a_half_vec, i32 1 + %b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %b) + %b_half = extractelement <2 x half> %b_half_vec, i32 1 + + %vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0 + %vec = insertelement 
<2 x half> %vec_tmp, half %b_half, i32 1 + + %vec_i32 = bitcast <2 x half> %vec to i32 + ret i32 %vec_i32 +} + +; ALL-LABEL: {{^}}packed_convert_low_high: +; GFX67: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]*}}, v0, v1 +; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]*}}, v0, v1 +; ALL-NOT: v_cvt_pkrtz_f16_f32 +define i32 @packed_convert_low_high(float %a, float %b) #0 { + %a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 0.0) + %a_half = extractelement <2 x half> %a_half_vec, i32 0 + %b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %b) + %b_half = extractelement <2 x half> %b_half_vec, i32 1 + + %vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0 + %vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1 + + %vec_i32 = bitcast <2 x half> %vec to i32 + ret i32 %vec_i32 +} + +; ALL-LABEL: {{^}}packed_convert_high_low: +; GFX67: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]*}}, v0, v1 +; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]*}}, v0, v1 +; ALL-NOT: v_cvt_pkrtz_f16_f32 +define i32 @packed_convert_high_low(float %a, float %b) #0 { + %a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %a) + %a_half = extractelement <2 x half> %a_half_vec, i32 1 + %b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %b, float 0.0) + %b_half = extractelement <2 x half> %b_half_vec, i32 0 + + %vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0 + %vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1 + + %vec_i32 = bitcast <2 x half> %vec to i32 + ret i32 %vec_i32 +} + +; ALL-LABEL: {{^}}packed_convert_imm: +; GFX67: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]*}}, 0x40a00000, v1 +; GFX89: s_mov_b32 s{{[0-9]*}}, 0x40a00000 +; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]*}}, s{{[0-9]*}}, v1 +; ALL-NOT: v_cvt_pkrtz_f16_f32 +define i32 @packed_convert_imm(float %a, float %b) #0 { + %a_half = fptrunc float 5.0 to half + %b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %b, float 0.0) + %b_half = extractelement <2 x half> %b_half_vec, i32 0 + + 
%vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0 + %vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1 + + %vec_i32 = bitcast <2 x half> %vec to i32 + ret i32 %vec_i32 +} + +; ALL-LABEL: {{^}}packed_convert_low_neg: +; GFX67: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]*}}, -v0, v1 +; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]*}}, -v0, v1 +; ALL-NOT: v_cvt_pkrtz_f16_f32 +define i32 @packed_convert_low_neg(float %a, float %b) #0 { + %a_neg = fneg float %a + + %a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a_neg, float 0.0) + %a_half = extractelement <2 x half> %a_half_vec, i32 0 + %b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %b, float 0.0) + %b_half = extractelement <2 x half> %b_half_vec, i32 0 + + %vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0 + %vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1 + + %vec_i32 = bitcast <2 x half> %vec to i32 + ret i32 %vec_i32 +} + +; ALL-LABEL: {{^}}packed_convert_high_neg: +; GFX67: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]*}}, -v0, v1 +; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]*}}, -v0, v1 +; ALL-NOT: v_cvt_pkrtz_f16_f32 +define i32 @packed_convert_high_neg(float %a, float %b) #0 { + %a_neg = fneg float %a + + %a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %a_neg) + %a_half = extractelement <2 x half> %a_half_vec, i32 1 + %b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %b) + %b_half = extractelement <2 x half> %b_half_vec, i32 1 + + %vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0 + %vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1 + + %vec_i32 = bitcast <2 x half> %vec to i32 + ret i32 %vec_i32 +} + +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone }