# Patch summary (free-form preamble ahead of the first "Index:" header).
#
# Files touched:
#   llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
#   llvm/lib/Target/AMDGPU/VOP3Instructions.td
#   llvm/test/CodeGen/AMDGPU/mad.u16.ll
#
# AMDGPUInstructions.td:
#   Adds the PatFrag class FoldTernaryOpPat, which takes three operands
#   ($src0, $src1, $src2) and matches (op2 (op1 $src0, $src1), $src2), and
#   defines `imad` from it.
#
# VOP3Instructions.td:
#   * Adds multiclass Arithmetic_i16_0Hi_TernaryPats: a GCNPat that matches an
#     i32 zext of a three-operand i16 op and selects `inst` directly on the
#     VSrc_b16 operands. It is instantiated only under
#     [Has16BitInsts, isGFX8GFX9]; per the note in the patch, 16-bit
#     instructions zero the high 16 bits on GFX8/GFX9 but preserve them on
#     GFX10+.
#   * Ternary_i16_Pats now matches a single three-operand `op` instead of the
#     nested (op2 (op1 ...)) form, and is instantiated once instead of twice.
#   * Ternary_i16_Pats_gfx9 changes from a multiclass wrapping one GCNPat to a
#     plain class deriving from GCNPat; it is instantiated once (previously
#     twice) under [Has16BitInsts, isGFX10Plus].
#
# mad.u16.ll:
#   For v_mad_u16_zext and v_mad_u16_zext64, the GFX8 checks now expect a
#   single v_mad_u16 and the GFX9 checks a single v_mad_legacy_u16, replacing
#   the v_mul_lo_u16_e32 + v_add_u16_e32 pair.
#
# NOTE(review): this copy has the whole patch collapsed onto three physical
#   lines, so line-oriented patch tools presumably cannot apply it as-is.
# NOTE(review): template parameter/argument lists look to be missing -- e.g.
#   FoldTernaryOpPat uses op1/op2 without declaring them, the multiclasses use
#   op/inst without declaring them, and every def/defm instantiation passes no
#   arguments. Presumably "<...>" text was dropped in extraction; confirm
#   against the upstream review before relying on this copy.
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -200,6 +200,12 @@ }]; } +class FoldTernaryOpPat : PatFrag< + (ops node:$src0, node:$src1, node:$src2), + (op2 (op1 node:$src0, node:$src1), node:$src2) +>; + +def imad : FoldTernaryOpPat; let Properties = [SDNPCommutative, SDNPAssociative] in { def smax_oneuse : HasOneUseBinOp; Index: llvm/lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -381,36 +381,43 @@ def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>; } // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus] -let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in { +// Note: 16-bit instructions produce a 0 result in the high 16-bits +// on GFX8 and GFX9 and preserve high 16 bits on GFX10+ +multiclass Arithmetic_i16_0Hi_TernaryPats { + def : GCNPat< + (i32 (zext (op i16:$src0, i16:$src1, i16:$src2))), + (inst VSrc_b16:$src0, VSrc_b16:$src1, VSrc_b16:$src2) + >; +} -multiclass Ternary_i16_Pats { -def : GCNPat < - (op2 (op1 i16:$src0, i16:$src1), i16:$src2), - (inst i16:$src0, i16:$src1, i16:$src2, (i1 0)) ->; +let Predicates = [Has16BitInsts, isGFX8GFX9] in { +defm : Arithmetic_i16_0Hi_TernaryPats; +} + +let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in { +// FIXME: Should be able to just pass imad to the instruction +// definition pattern, but the implied clamp input interferes. 
+multiclass Ternary_i16_Pats { + def : GCNPat < + (op i16:$src0, i16:$src1, i16:$src2), + (inst i16:$src0, i16:$src1, i16:$src2, (i1 0)) + >; } -defm: Ternary_i16_Pats; -defm: Ternary_i16_Pats; +defm: Ternary_i16_Pats; } // End Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] -let Predicates = [Has16BitInsts, isGFX10Plus] in { -multiclass Ternary_i16_Pats_gfx9 { -def : GCNPat < +class Ternary_i16_Pats_gfx9 : GCNPat < (op2 (op1 i16:$src0, i16:$src1), i16:$src2), (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE) >; -} - -defm: Ternary_i16_Pats_gfx9; -defm: Ternary_i16_Pats_gfx9; - +let Predicates = [Has16BitInsts, isGFX10Plus] in { +def: Ternary_i16_Pats_gfx9; } // End Predicates = [Has16BitInsts, isGFX10Plus] class ThreeOpFragSDAG : PatFrag< Index: llvm/test/CodeGen/AMDGPU/mad.u16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mad.u16.ll +++ llvm/test/CodeGen/AMDGPU/mad.u16.ll @@ -136,15 +136,13 @@ ; GFX8-LABEL: v_mad_u16_zext: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v2 +; GFX8-NEXT: v_mad_u16 v0, v0, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mad_u16_zext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v2 +; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mad_u16_zext: @@ -173,16 +171,14 @@ ; GFX8-LABEL: v_mad_u16_zext64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v2 +; GFX8-NEXT: v_mad_u16 v0, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mad_u16_zext64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u16_e32 v0, 
v0, v1 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v2 +; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ;