Patch summary (free text ahead of the first "Index:" header):

  * AMDGPUInstrInfo.td: the SDNode for AMDGPUISD::FDOT2 is renamed to
    AMDGPUfdot2_impl, and AMDGPUfdot2 becomes a PatFrags that matches either
    int_amdgcn_fdot2 or AMDGPUfdot2_impl.
  * VOP3PInstructions.td: the DotPats pattern now takes the clamp operand as
    timm and spells out the operand classes of the three sources.
  * Adds llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll, run for
    gfx906, gfx1011 and gfx1012.

NOTE(review): this patch was reconstructed from a copy whose line breaks and
indentation had been collapsed. Leading whitespace on context lines is a
best-effort reconstruction; apply with `patch -l` if a hunk is rejected for
whitespace only.
NOTE(review): the `defm : DotPats<...>` context line had lost its template
arguments; `<AMDGPUfdot2, V_DOT2_F32_F16>` is inferred from the rest of the
patch. Confirm against the tree.
NOTE(review): v_fdot2_neg_a_neg_b originally negated %b twice; it now negates
%a and %b, and its CHECK lines were updated by hand. Regenerate them with
utils/update_llc_test_checks.py before committing.

Index: llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -321,7 +321,7 @@
 
 def AMDGPUfmed3_impl : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
 
-def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
+def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
                   SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
                                        SDTCisFP<0>, SDTCisVec<1>,
                                        SDTCisInt<4>]>,
@@ -468,3 +468,7 @@
 def AMDGPUbfe_u32 : PatFrags<(ops node:$src0, node:$src1, node:$src2),
   [(int_amdgcn_ubfe node:$src0, node:$src1, node:$src2),
    (AMDGPUbfe_u32_impl node:$src0, node:$src1, node:$src2)]>;
+
+def AMDGPUfdot2 : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$src3),
+  [(int_amdgcn_fdot2 node:$src0, node:$src1, node:$src2, node:$src3),
+   (AMDGPUfdot2_impl node:$src0, node:$src1, node:$src2, node:$src3)]>;
Index: llvm/lib/Target/AMDGPU/VOP3PInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -286,8 +286,10 @@
   def : GCNPat <
     (dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)),
             (dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)),
-            (dot_inst.Pfl.Src2VT (VOP3PMods dot_inst.Pfl.Src2VT:$src2, i32:$src2_modifiers)), i1:$clamp),
-    (dot_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, (as_i1timm $clamp))>;
+            (dot_inst.Pfl.Src2VT (VOP3PMods dot_inst.Pfl.Src2VT:$src2, i32:$src2_modifiers)), timm:$clamp),
+    (dot_inst $src0_modifiers, VSrc_v2f16:$src0,
+              $src1_modifiers, VSrc_v2f16:$src1,
+              $src2_modifiers, VSrc_f32:$src2, timm:$clamp)>;
 }
 
 defm : DotPats<AMDGPUfdot2, V_DOT2_F32_F16>;
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
@@ -0,0 +1,183 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+
+define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdot2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false)
+  ret float %r
+}
+
+define float @v_fdot2_clamp(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_clamp:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdot2_clamp:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 true)
+  ret float %r
+}
+
+define float @v_fdot2_neg_a(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_neg_a:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdot2_neg_a:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = fneg <2 x half> %a
+  %r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false)
+  ret float %r
+}
+
+define float @v_fdot2_neg_b(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_neg_b:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdot2_neg_b:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %neg.b = fneg <2 x half> %b
+  %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false)
+  ret float %r
+}
+
+define float @v_fdot2_neg_a_neg_b(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_neg_a_neg_b:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdot2_neg_a_neg_b:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = fneg <2 x half> %a
+  %neg.b = fneg <2 x half> %b
+  %r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %neg.b, float %c, i1 false)
+  ret float %r
+}
+
+define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_neg_c:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdot2_neg_c:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %neg.c = fneg float %c
+  %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)
+  ret float %r
+}
+
+define float @v_fdot2_inline_literal_a(<2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_inline_literal_a:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    s_movk_i32 s4, 0x4000
+; GFX906-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
+; GFX906-NEXT:    v_dot2_f32_f16 v0, s4, v0, v1
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdot2_inline_literal_a:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_movk_i32 s4, 0x4000
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
+; GFX10-NEXT:    v_dot2_f32_f16 v0, s4, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false)
+  ret float %ret
+}
+
+define float @v_fdot2_inline_literal_b(<2 x half> %a, float %c) {
+; GFX906-LABEL: v_fdot2_inline_literal_b:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    s_movk_i32 s4, 0x4000
+; GFX906-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
+; GFX906-NEXT:    v_dot2_f32_f16 v0, v0, s4, v1
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdot2_inline_literal_b:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_movk_i32 s4, 0x4000
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
+; GFX10-NEXT:    v_dot2_f32_f16 v0, v0, s4, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false)
+  ret float %ret
+}
+
+define float @v_fdot2_inline_literal_c(<2 x half> %a, <2 x half> %b) {
+; GFX906-LABEL: v_fdot2_inline_literal_c:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_dot2_f32_f16 v0, v0, v1, 1.0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdot2_inline_literal_c:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_dot2_f32_f16 v0, v0, v1, 1.0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 1.0, i1 false)
+  ret float %ret
+}
+
+declare float @llvm.amdgcn.fdot2(<2 x half>, <2 x half>, float, i1 immarg) #0
+
+attributes #0 = { nounwind readnone speculatable }