Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -168,6 +168,7 @@
 
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
+  bool visitFdot2IntrinsicInst(IntrinsicInst &I);
 
   bool doInitialization(Module &M) override;
   bool runOnFunction(Function &F) override;
@@ -866,6 +867,8 @@
   switch (I.getIntrinsicID()) {
   case Intrinsic::bitreverse:
     return visitBitreverseIntrinsicInst(I);
+  case Intrinsic::amdgcn_fdot2:
+    return visitFdot2IntrinsicInst(I);
   default:
     return false;
   }
@@ -881,6 +884,51 @@
   return Changed;
 }
 
+bool AMDGPUCodeGenPrepare::visitFdot2IntrinsicInst(IntrinsicInst &I) {
+  // Subtargets reporting hasDLInsts() keep the intrinsic call untouched.
+  if (ST->hasDLInsts())
+    return false;
+
+  // Everywhere else, open-code the dot product in f32. With
+  //   half2 a, b;
+  //   float c, d;
+  // the replacement computes
+  //   d = (float)a.x * (float)b.x + ((float)a.y * (float)b.y + c)
+  // as two chained llvm.fmuladd calls, emitted right before the original call.
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+  Type *F32Ty = Builder.getFloatTy();
+
+  // Pulls element Lane out of a vector operand and extends it to float.
+  auto ExtendLane = [&](Value *Vec, unsigned Lane) -> Value * {
+    Value *Elt = Builder.CreateExtractElement(Vec, Builder.getInt32(Lane));
+    return Builder.CreateFPExt(Elt, F32Ty);
+  };
+
+  Value *VecA = I.getOperand(0);
+  Value *VecB = I.getOperand(1);
+  Value *Addend = I.getOperand(2);
+
+  Value *LoA = ExtendLane(VecA, 0);
+  Value *HiA = ExtendLane(VecA, 1);
+  Value *LoB = ExtendLane(VecB, 0);
+  Value *HiB = ExtendLane(VecB, 1);
+
+  // Inner step: a.y * b.y + c.
+  Value *HiAcc =
+      Builder.CreateIntrinsic(Intrinsic::fmuladd, {F32Ty}, {HiA, HiB, Addend});
+  // Outer step: a.x * b.x + inner.
+  Value *Dot =
+      Builder.CreateIntrinsic(Intrinsic::fmuladd, {F32Ty}, {LoA, LoB, HiAcc});
+
+  // Operand 3 (clamp) is not read here; per the original note, the clamp
+  // argument is ignored by v_dot2_f32_f16.
+
+  I.replaceAllUsesWith(Dot);
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
   Mod = &M;
   return false;
Index: test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
@@ -1,9 +1,35 @@
-; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX906
+; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX906,GFX9
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX900,GFX9
+; RUN: llc -march=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX8
 
 declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 %clamp)
 
-; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2_clamp
-; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+; GCN-LABEL: {{^}}test_llvm_amdgcn_fdot2_clamp
+; GCN: s_load_dword [[A:s[0-9]+]],
+; GCN: s_load_dword [[B:s[0-9]+]],
+; GCN: s_load_dword [[C:s[0-9]+]],
+
+; GFX9-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
+; GFX9-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[C]]
+; GFX906: v_dot2_f32_f16 [[RES:v[0-9]+]], [[A]], [[VB]], [[VC]] clamp{{$}}
+
+; GFX8-DAG: s_lshr_b32 [[AY16:s[0-9]+]], [[A]], 16
+; GFX8-DAG: s_lshr_b32 [[BY16:s[0-9]+]], [[B]], 16
+; GFX8-DAG: v_cvt_f32_f16_e32 [[AY:v[0-9]+]], [[AY16]]
+; GFX8-DAG: v_cvt_f32_f16_e32 [[BY:v[0-9]+]], [[BY16]]
+; GFX8-DAG: v_cvt_f32_f16_e32 [[AX:v[0-9]+]], [[A]]
+; GFX8-DAG: v_cvt_f32_f16_e32 [[BX:v[0-9]+]], [[B]]
+; GFX8: v_mad_f32 [[RES:v[0-9]+]], [[AY]], [[BY]], [[C]]
+; GFX8: v_mac_f32_e32 [[RES]], [[AX]], [[BX]]
+
+; GFX900: v_mad_mix_f32 [[FMA:v[0-9]+]], [[A]], [[VB]], [[VC]] op_sel:[1,1,0] op_sel_hi:[1,1,0]
+
+; FIXME: This mov below is not needed, but first mad_mix overrides [[VB]]
+; GFX900-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
+; GFX900: v_mad_mix_f32 [[RES:v[0-9]+]], [[A]], [[VB]], [[FMA]] op_sel_hi:[1,1,0]
+
+; GCN: store_dword v[{{[0-9:]+}}], [[RES]]
+
 define amdgpu_kernel void @test_llvm_amdgcn_fdot2_clamp(
     float addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
@@ -18,8 +44,32 @@
   ret void
 }
 
-; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2_no_clamp
-; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; GCN-LABEL: {{^}}test_llvm_amdgcn_fdot2_no_clamp
+; GCN: s_load_dword [[A:s[0-9]+]],
+; GCN: s_load_dword [[B:s[0-9]+]],
+; GCN: s_load_dword [[C:s[0-9]+]],
+
+; GFX9-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
+; GFX9-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[C]]
+; GFX906: v_dot2_f32_f16 [[RES:v[0-9]+]], [[A]], [[VB]], [[VC]]{{$}}
+
+; GFX8-DAG: s_lshr_b32 [[AY16:s[0-9]+]], [[A]], 16
+; GFX8-DAG: s_lshr_b32 [[BY16:s[0-9]+]], [[B]], 16
+; GFX8-DAG: v_cvt_f32_f16_e32 [[AY:v[0-9]+]], [[AY16]]
+; GFX8-DAG: v_cvt_f32_f16_e32 [[BY:v[0-9]+]], [[BY16]]
+; GFX8-DAG: v_cvt_f32_f16_e32 [[AX:v[0-9]+]], [[A]]
+; GFX8-DAG: v_cvt_f32_f16_e32 [[BX:v[0-9]+]], [[B]]
+; GFX8: v_mad_f32 [[RES:v[0-9]+]], [[AY]], [[BY]], [[C]]
+; GFX8: v_mac_f32_e32 [[RES]], [[AX]], [[BX]]
+
+; GFX900: v_mad_mix_f32 [[FMA:v[0-9]+]], [[A]], [[VB]], [[VC]] op_sel:[1,1,0] op_sel_hi:[1,1,0]
+
+; FIXME: This mov below is not needed, but first mad_mix overrides [[VB]]
+; GFX900-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
+; GFX900: v_mad_mix_f32 [[RES:v[0-9]+]], [[A]], [[VB]], [[FMA]] op_sel_hi:[1,1,0]
+
+; GCN: store_dword v[{{[0-9:]+}}], [[RES]]
+
 define amdgpu_kernel void @test_llvm_amdgcn_fdot2_no_clamp(
     float addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,