Index: lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- lib/Target/AMDGPU/SIFoldOperands.cpp
+++ lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -775,6 +775,7 @@
   return true;
 }
 
+// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
 bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
   const MachineOperand *ClampSrc = isClamp(MI);
   if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -111,6 +111,7 @@
   SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performBuildVectorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   unsigned getFusedOpcode(const SelectionDAG &DAG,
                           const SDNode *N0, const SDNode *N1) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -502,6 +502,7 @@
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::BUILD_VECTOR);
 
   // All memory operations. Some folding on the pointer operand is done to help
   // matching the constant offsets in the addressing modes.
@@ -5853,7 +5854,7 @@
   SDNode *N, DAGCombinerInfo &DCI) const {
   SDValue Vec = N->getOperand(0);
 
-  SelectionDAG &DAG= DCI.DAG;
+  SelectionDAG &DAG = DCI.DAG;
   if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
     SDLoc SL(N);
     EVT EltVT = N->getValueType(0);
@@ -5866,6 +5867,47 @@
   return SDValue();
 }
 
+static bool convertBuildVectorCastElt(SelectionDAG &DAG,
+                                      SDValue &Lo, SDValue &Hi) {
+  if (Hi.getOpcode() == ISD::BITCAST &&
+      Hi.getOperand(0).getValueType() == MVT::f16 &&
+      (isa<ConstantSDNode>(Lo) || Lo.isUndef())) {
+    Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
+    Hi = Hi.getOperand(0);
+    return true;
+  }
+
+  return false;
+}
+
+SDValue SITargetLowering::performBuildVectorCombine(
+  SDNode *N, DAGCombinerInfo &DCI) const {
+  SDLoc SL(N);
+
+  if (!isTypeLegal(MVT::v2i16))
+    return SDValue();
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+
+  if (VT == MVT::v2i16) {
+    SDValue Lo = N->getOperand(0);
+    SDValue Hi = N->getOperand(1);
+
+    // v2i16 build_vector (const|undef), (bitcast f16:$x)
+    // -> bitcast (v2f16 build_vector (const|undef), $x)
+    if (convertBuildVectorCastElt(DAG, Lo, Hi)) {
+      SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi });
+      return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
+    }
+
+    if (convertBuildVectorCastElt(DAG, Hi, Lo)) {
+      SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo });
+      return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
+    }
+  }
+
+  return SDValue();
+}
 
 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                           const SDNode *N0,
@@ -6287,6 +6329,8 @@
   }
   case ISD::EXTRACT_VECTOR_ELT:
     return performExtractVectorEltCombine(N, DCI);
+  case ISD::BUILD_VECTOR:
+    return performBuildVectorCombine(N, DCI);
   }
   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
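The BUILD_VECTOR combine added above rewrites an integer-typed vector pack whose high element is a bitcast of an f16 value into an f16 build_vector, so the mad-mix patterns below can see through the integer packing. A minimal sketch of the rewrite, following the comment in performBuildVectorCombine (illustrative DAG notation, not a test from this patch):

  (v2i16 (build_vector (i16 C|undef), (i16 (bitcast f16:$x))))
    --> (v2i16 (bitcast (v2f16 (build_vector (f16 (bitcast C|undef)), f16:$x))))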
Index: lib/Target/AMDGPU/VOP3PInstructions.td
===================================================================
--- lib/Target/AMDGPU/VOP3PInstructions.td
+++ lib/Target/AMDGPU/VOP3PInstructions.td
@@ -84,8 +84,11 @@
 
 // Clamp modifier is applied after conversion to f16.
 def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, 1>;
+
+let ClampLo = 0, ClampHi = 1 in {
 def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, 1>;
 }
+}
 
 let Predicates = [HasMadMix] in {
 
@@ -96,10 +99,56 @@
   (V_MAD_MIXLO_F16 $src0_modifiers, $src0,
                    $src1_modifiers, $src1,
                    $src2_modifiers, $src2,
-                   0,
+                   DSTCLAMP.NONE,
                    (i32 (IMPLICIT_DEF)))
 >;
 
+// FIXME: Special case handling for mixhi (especially for clamp)
+// because dealing with the write to high half of the register is
+// difficult.
+def : Pat <
+  (build_vector f16:$elt0, (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+                                          (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+                                          (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+  (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
+                          $src1_modifiers, $src1,
+                          $src2_modifiers, $src2,
+                          DSTCLAMP.NONE,
+                          $elt0))
+>;
+
+def : Pat <
+  (build_vector
+    f16:$elt0,
+    (AMDGPUclamp (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+                                (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+                                (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
+  (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
+                          $src1_modifiers, $src1,
+                          $src2_modifiers, $src2,
+                          DSTCLAMP.ENABLE,
+                          $elt0))
+>;
+
+def : Pat <
+  (AMDGPUclamp (build_vector
+    (fpround (fmad (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
+                   (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
+                   (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
+    (fpround (fmad (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
+                   (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
+                   (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
+  (v2f16 (V_MAD_MIXHI_F16 $hi_src0_modifiers, $hi_src0,
+                          $hi_src1_modifiers, $hi_src1,
+                          $hi_src2_modifiers, $hi_src2,
+                          DSTCLAMP.ENABLE,
+                          (V_MAD_MIXLO_F16 $lo_src0_modifiers, $lo_src0,
+                                           $lo_src1_modifiers, $lo_src1,
+                                           $lo_src2_modifiers, $lo_src2,
+                                           DSTCLAMP.ENABLE,
+                                           (i32 (IMPLICIT_DEF)))))
+>;
+
 } // End Predicates = [HasMadMix]
 
 multiclass VOP3P_Real_vi<bits<10> op> {
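For reference, the last pattern above splits a clamped v2f16 result into a mixlo/mixhi pair that writes the two halves of one register, with the clamp applied to both halves. A hedged sketch of the expected selection, taken from the v_mad_mix_v2f32_clamp_postcvt test updated below:

  v_mad_mixlo_f16 v3, v0, v1, v2 clamp
  v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] clamp
  v_mov_b32_e32 v0, v3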
Index: test/CodeGen/AMDGPU/mad-mix-hi.ll
===================================================================
--- test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -2,12 +2,10 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI %s
 ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
 
-; FIXME: These cases should be able to use v_mad_mixhi_f16 and avoid
-; the packing.
-
 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
-; GFX9: v_mad_mixlo_f16
-; GFX9: v_lshl_or_b32
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2
+; GFX9-NEXT: s_setpc_b64
 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %src1, half %src2) #0 {
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -19,8 +17,11 @@
 }
 
 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
-; GFX9: v_mad_mixlo_f16
-; GFX9: v_lshl_or_b32
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x3c00
+; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: s_setpc_b64
 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %src1, half %src2) #0 {
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -32,8 +33,10 @@
 }
 
 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
-; GFX9: v_mad_mixlo_f16
-; GFX9: v_lshl_or_b32
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: s_setpc_b64
 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src1, half %src2, half %lo) #0 {
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -46,8 +49,10 @@
 }
 
 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
-; GFX9: v_mad_mixlo_f16 v0, v0, v1, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: s_setpc_b64
 define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, half %src2) #0 {
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -61,8 +66,10 @@
 }
 
 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
-; GFX9: v_mad_mixlo_f16 v0, v0, v1, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: s_setpc_b64
 define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src1, half %src2) #0 {
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -90,12 +97,9 @@
   ret <2 x half> %vec.result
 }
 
-; FIXME: Unnecessary junk to pack, and packing undef?
 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
-; GFX9: v_mad_mixlo_f16 v0, v0, v1, v2 clamp{{$}}
-; GFX9-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
-; GFX9-NEXT: v_and_b32_e32 [[AND:v[0-9]+]], s6, [[MASK]]
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, [[AND]]
+; GCN: s_waitcnt
+; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 clamp{{$}}
 ; GFX9-NEXT: s_setpc_b64
 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half %src0, half %src1, half %src2) #0 {
   %src0.ext = fpext half %src0 to float
@@ -109,6 +113,27 @@
   ret <2 x half> %vec.result
 }
 
+
+; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; GCN: s_waitcnt
+; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2{{$}}
+; GFX9-NEXT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, v3
+; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 clamp{{$}}
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64
+define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to half
+  store volatile half %cvt.result, half addrspace(1)* undef
+  %max = call half @llvm.maxnum.f16(half %cvt.result, half 0.0)
+  %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
+  %vec.result = insertelement <2 x half> undef, half %clamp, i32 1
+  ret <2 x half> %vec.result
+}
+
 declare half @llvm.minnum.f16(half, half) #1
 declare half @llvm.maxnum.f16(half, half) #1
 declare float @llvm.minnum.f32(float, float) #1
Index: test/CodeGen/AMDGPU/mad-mix-lo.ll
===================================================================
--- test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -75,13 +75,15 @@
   ret half %cvt.result
 }
 
-; GCN-LABEL: {{^}}v_mad_mixlo_v2f32:
-; GFX9: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1]
-; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
+; FIXME: Should be able to avoid extra register because first
+; operation only clobbers relevant lane.
+; GCN-LABEL: {{^}}v_mad_mix_v2f32:
+; GCN: s_waitcnt
+; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2{{$}}
+; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1]{{$}}
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
 ; GFX9-NEXT: s_setpc_b64
-define <2 x half> @v_mad_mixlo_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
   %src2.ext = fpext <2 x half> %src2 to <2 x float>
@@ -90,13 +92,13 @@
   ret <2 x half> %cvt.result
 }
 
-; GCN-LABEL: {{^}}v_mad_mixlo_v3f32:
+; GCN-LABEL: {{^}}v_mad_mix_v3f32:
 ; GCN: s_waitcnt
 ; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v3, v6
 ; GFX9-NEXT: v_mad_mixlo_f16 v1, v1, v4, v7
 ; GFX9-NEXT: v_mad_mixlo_f16 v2, v2, v5, v8
 ; GFX9-NEXT: s_setpc_b64
-define <3 x half> @v_mad_mixlo_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
+define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
   %src0.ext = fpext <3 x half> %src0 to <3 x float>
   %src1.ext = fpext <3 x half> %src1 to <3 x float>
   %src2.ext = fpext <3 x half> %src2 to <3 x float>
@@ -105,19 +107,16 @@
   ret <3 x half> %cvt.result
 }
 
-; GCN-LABEL: {{^}}v_mad_mixlo_v4f32:
+; GCN-LABEL: {{^}}v_mad_mix_v4f32:
 ; GCN: s_waitcnt
-; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1]
-; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v2, v4
-; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT: v_mad_mixlo_f16 v4, v1, v3, v5 op_sel:[1,1,1]
-; GFX9-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5
-; GFX9-NEXT: v_and_b32_e32 v0, v2, v0
-; GFX9-NEXT: v_and_b32_e32 v1, v2, v1
-; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0
-; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1
+; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4
+; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1]
+; GFX9-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5
+; GFX9-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1]
+; GFX9-NEXT: v_mov_b32_e32 v0, v6
+; GFX9-NEXT: v_mov_b32_e32 v1, v2
 ; GFX9-NEXT: s_setpc_b64
-define <4 x half> @v_mad_mixlo_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
+define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
   %src0.ext = fpext <4 x half> %src0 to <4 x float>
   %src1.ext = fpext <4 x half> %src1 to <4 x float>
   %src2.ext = fpext <4 x half> %src2 to <4 x float>
@@ -128,10 +127,9 @@
 
 ; FIXME: Fold clamp
 ; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt:
-; GFX9: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1]
-; GFX9: v_mad_mixlo_f16 v0, v0, v1, v2
-; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]]
-; GFX9: v_pk_max_f16 v0, [[PACKED]], [[PACKED]] clamp{{$}}
+; GFX9: v_mad_mixlo_f16 v3, v0, v1, v2 clamp{{$}}
+; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] clamp{{$}}
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
 ; GFX9-NEXT: s_setpc_b64
 define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
@@ -144,14 +142,162 @@
   ret <2 x half> %clamp
 }
 
+; FIXME: Should be packed into 2 registers per argument?
+; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_postcvt:
+; GCN: s_waitcnt
+; GFX9-NEXT: v_mad_mixlo_f16 v2, v2, v5, v8 clamp
+; GFX9-NEXT: v_mad_mixhi_f16 v2, v0, v0, v0 op_sel_hi:[0,0,0] clamp
+; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v3, v6 clamp
+; GFX9-NEXT: v_mad_mixhi_f16 v0, v1, v4, v7 clamp
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: s_setpc_b64
+define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
+  %src0.ext = fpext <3 x half> %src0 to <3 x float>
+  %src1.ext = fpext <3 x half> %src1 to <3 x float>
+  %src2.ext = fpext <3 x half> %src2 to <3 x float>
+  %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
+  %cvt.result = fptrunc <3 x float> %result to <3 x half>
+  %max = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %cvt.result, <3 x half> zeroinitializer)
+  %clamp = call <3 x half> @llvm.minnum.v3f16(<3 x half> %max, <3 x half> <half 1.0, half 1.0, half 1.0>)
+  ret <3 x half> %clamp
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_postcvt:
+; GCN: s_waitcnt
+; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 clamp
+; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] clamp
+; GFX9-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 clamp
+; GFX9-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] clamp
+; GFX9-DAG: v_mov_b32_e32 v0, v6
+; GFX9-DAG: v_mov_b32_e32 v1, v2
+; GFX9: s_setpc_b64
+define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
+  %src0.ext = fpext <4 x half> %src0 to <4 x float>
+  %src1.ext = fpext <4 x half> %src1 to <4 x float>
+  %src2.ext = fpext <4 x half> %src2 to <4 x float>
+  %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
+  %cvt.result = fptrunc <4 x float> %result to <4 x half>
+  %max = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %cvt.result, <4 x half> zeroinitializer)
+  %clamp = call <4 x half> @llvm.minnum.v4f16(<4 x half> %max, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>)
+  ret <4 x half> %clamp
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt_lo:
+; GCN: s_waitcnt
+; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 clamp
+; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: s_setpc_b64
+define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+  %src0.ext = fpext <2 x half> %src0 to <2 x float>
+  %src1.ext = fpext <2 x half> %src1 to <2 x float>
+  %src2.ext = fpext <2 x half> %src2 to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+  %cvt.result = fptrunc <2 x float> %result to <2 x half>
+  %cvt.lo = extractelement <2 x half> %cvt.result, i32 0
+  %max.lo = call half @llvm.maxnum.f16(half %cvt.lo, half 0.0)
+  %clamp.lo = call half @llvm.minnum.f16(half %max.lo, half 1.0)
+  %insert = insertelement <2 x half> %cvt.result, half %clamp.lo, i32 0
+  ret <2 x half> %insert
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt_hi:
+; GCN: s_waitcnt
+; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2
+; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] clamp
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: s_setpc_b64
+define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+  %src0.ext = fpext <2 x half> %src0 to <2 x float>
+  %src1.ext = fpext <2 x half> %src1 to <2 x float>
+  %src2.ext = fpext <2 x half> %src2 to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+  %cvt.result = fptrunc <2 x float> %result to <2 x half>
+  %cvt.hi = extractelement <2 x half> %cvt.result, i32 1
+  %max.hi = call half @llvm.maxnum.f16(half %cvt.hi, half 0.0)
+  %clamp.hi = call half @llvm.minnum.f16(half %max.hi, half 1.0)
+  %insert = insertelement <2 x half> %cvt.result, half %clamp.hi, i32 1
+  ret <2 x half> %insert
+}
+
+; FIXME: Should be able to use mixlo/mixhi
+; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_precvt:
+; GFX9: v_mad_mix_f32 v3, v0, v1, v2 clamp
+; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] clamp
+; GFX9: v_cvt_f16_f32_e32 v1, v3
+; GFX9: v_cvt_f16_f32_e32 v0, v0
+; GFX9: v_and_b32_e32 v1, 0xffff, v1
+; GFX9: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9: s_setpc_b64
+define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+  %src0.ext = fpext <2 x half> %src0 to <2 x float>
+  %src1.ext = fpext <2 x half> %src1 to <2 x float>
+  %src2.ext = fpext <2 x half> %src2 to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+  %max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %result, <2 x float> zeroinitializer)
+  %clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>)
+  %cvt.result = fptrunc <2 x float> %clamp to <2 x half>
+  ret <2 x half> %cvt.result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_precvt:
+; GFX9: v_mad_mix_f32 v0, v0, v3, v6 clamp
+; GFX9: v_mad_mix_f32 v1, v1, v4, v7 clamp
+; GFX9: v_mad_mix_f32 v2, v2, v5, v8 clamp
+; GFX9: v_cvt_f16_f32
+; GFX9: v_cvt_f16_f32
+; GFX9: v_cvt_f16_f32
+define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
+  %src0.ext = fpext <3 x half> %src0 to <3 x float>
+  %src1.ext = fpext <3 x half> %src1 to <3 x float>
+  %src2.ext = fpext <3 x half> %src2 to <3 x float>
+  %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
+  %max = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %result, <3 x float> zeroinitializer)
+  %clamp = call <3 x float> @llvm.minnum.v3f32(<3 x float> %max, <3 x float> <float 1.0, float 1.0, float 1.0>)
+  %cvt.result = fptrunc <3 x float> %clamp to <3 x half>
+  ret <3 x half> %cvt.result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_precvt:
+; GFX9: v_mad_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] clamp
+; GFX9: v_mad_mix_f32 v0, v0, v2, v4 clamp
+; GFX9: v_mad_mix_f32 v2, v1, v3, v5 op_sel:[1,1,1] clamp
+; GFX9: v_mad_mix_f32 v1, v1, v3, v5 clamp
+; GFX9: v_cvt_f16_f32
+; GFX9: v_cvt_f16_f32
+; GFX9: v_cvt_f16_f32
+; GFX9: v_cvt_f16_f32
+define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
+  %src0.ext = fpext <4 x half> %src0 to <4 x float>
+  %src1.ext = fpext <4 x half> %src1 to <4 x float>
+  %src2.ext = fpext <4 x half> %src2 to <4 x float>
+  %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
+  %max = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %result, <4 x float> zeroinitializer)
+  %clamp = call <4 x float> @llvm.minnum.v4f32(<4 x float> %max, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  %cvt.result = fptrunc <4 x float> %clamp to <4 x half>
+  ret <4 x half> %cvt.result
+}
+
 declare half @llvm.minnum.f16(half, half) #1
 declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
+declare <3 x half> @llvm.minnum.v3f16(<3 x half>, <3 x half>) #1
+declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>) #1
 declare half @llvm.maxnum.f16(half, half) #1
 declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
+declare <3 x half> @llvm.maxnum.v3f16(<3 x half>, <3 x half>) #1
+declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>) #1
 
 declare float @llvm.minnum.f32(float, float) #1
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1
+declare <3 x float> @llvm.minnum.v3f32(<3 x float>, <3 x float>) #1
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #1
+
 declare float @llvm.maxnum.f32(float, float) #1
+declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1
+declare <3 x float> @llvm.maxnum.v3f32(<3 x float>, <3 x float>) #1
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #1
+
 declare float @llvm.fmuladd.f32(float, float, float) #1
 declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
 declare <3 x float> @llvm.fmuladd.v3f32(<3 x float>, <3 x float>, <3 x float>) #1