Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
@@ -331,6 +331,7 @@
   bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
   SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
+  SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -653,6 +653,7 @@
 
     setOperationAction(ISD::FADD, MVT::v4f16, Custom);
     setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
+    setOperationAction(ISD::FMA, MVT::v4f16, Custom);
 
     setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
     setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
@@ -3971,6 +3972,30 @@
   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
 }
 
+SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  unsigned Opc = Op.getOpcode();
+  EVT VT = Op.getValueType();
+  assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+
+  SDValue Lo0, Hi0;
+  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
+  SDValue Lo1, Hi1;
+  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
+  SDValue Lo2, Hi2;
+  std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
+
+  SDLoc SL(Op);
+
+  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2,
+                             Op->getFlags());
+  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2,
+                             Op->getFlags());
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
+}
+
+
 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
@@ -4023,6 +4048,8 @@
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
     return lowerFMINNUM_FMAXNUM(Op, DAG);
+  case ISD::FMA:
+    return splitTernaryVectorOp(Op, DAG);
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:
Index: llvm/trunk/test/CodeGen/AMDGPU/fmac.sdwa.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmac.sdwa.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/fmac.sdwa.ll
@@ -3,74 +3,96 @@
 ; GCN-LABEL: {{^}}addMul2D:
 ; GFX1010: v_fmac_f16
 ; GFX1010: v_fmac_f16
-define hidden <4 x half> @addMul2D(<4 x i8>* nocapture readonly, float addrspace(4)* nocapture readonly, <2 x i32>, i32) local_unnamed_addr #0 {
-  %5 = extractelement <2 x i32> %2, i64 1
-  %6 = icmp sgt i32 %5, 0
-  br i1 %6, label %7, label %38
-
-7:                                                ; preds = %4
-  %8 = extractelement <2 x i32> %2, i64 0
-  %9 = icmp sgt i32 %8, 0
-  br label %10
-
-10:                                               ; preds = %34, %7
-  %11 = phi <4 x half> [ zeroinitializer, %7 ], [ %35, %34 ]
-  %12 = phi i32 [ 0, %7 ], [ %36, %34 ]
-  br i1 %9, label %13, label %34
-
-13:                                               ; preds = %10
-  %14 = mul nsw i32 %12, %3
-  %15 = mul nsw i32 %12, %8
-  br label %16
-
-16:                                               ; preds = %16, %13
-  %17 = phi <4 x half> [ %11, %13 ], [ %31, %16 ]
-  %18 = phi i32 [ 0, %13 ], [ %32, %16 ]
-  %19 = add nsw i32 %18, %14
-  %20 = sext i32 %19 to i64
-  %21 = getelementptr inbounds <4 x i8>, <4 x i8>* %0, i64 %20
-  %22 = load <4 x i8>, <4 x i8>* %21, align 4
-  %23 = tail call <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %22) #8
-  %24 = add nsw i32 %18, %15
-  %25 = sext i32 %24 to i64
-  %26 = getelementptr inbounds float, float addrspace(4)* %1, i64 %25
-  %27 = load float, float addrspace(4)* %26, align 4
-  %28 = fptrunc float %27 to half
-  %29 = insertelement <4 x half> undef, half %28, i32 0
-  %30 = shufflevector <4 x half> %29, <4 x half> undef, <4 x i32> zeroinitializer
-  %31 = tail call <4 x half> @llvm.fmuladd.v4f16(<4 x half> %23, <4 x half> %30, <4 x half> %17)
-  %32 = add nuw nsw i32 %18, 1
-  %33 = icmp eq i32 %32, %8
-  br i1 %33, label %34, label %16
-
-34:                                               ; preds = %16, %10
-  %35 = phi <4 x half> [ %11, %10 ], [ %31, %16 ]
-  %36 = add nuw nsw i32 %12, 1
-  %37 = icmp eq i32 %36, %5
-  br i1 %37, label %38, label %10
-
-38:                                               ; preds = %34, %4
-  %39 = phi <4 x half> [ zeroinitializer, %4 ], [ %35, %34 ]
-  ret <4 x half> %39
+define hidden <4 x half> @addMul2D(<4 x i8>* nocapture readonly %arg, float addrspace(4)* nocapture readonly %arg1, <2 x i32> %arg2, i32 %arg3) local_unnamed_addr #0 {
+bb:
+  %tmp = extractelement <2 x i32> %arg2, i64 1
+  %tmp4 = icmp sgt i32 %tmp, 0
+  br i1 %tmp4, label %bb5, label %bb36
+
+bb5:                                              ; preds = %bb
+  %tmp6 = extractelement <2 x i32> %arg2, i64 0
+  %tmp7 = icmp sgt i32 %tmp6, 0
+  br label %bb8
+
+bb8:                                              ; preds = %bb32, %bb5
+  %tmp9 = phi <4 x half> [ zeroinitializer, %bb5 ], [ %tmp33, %bb32 ]
+  %tmp10 = phi i32 [ 0, %bb5 ], [ %tmp34, %bb32 ]
+  br i1 %tmp7, label %bb11, label %bb32
+
+bb11:                                             ; preds = %bb8
+  %tmp12 = mul nsw i32 %tmp10, %arg3
+  %tmp13 = mul nsw i32 %tmp10, %tmp6
+  br label %bb14
+
+bb14:                                             ; preds = %bb14, %bb11
+  %tmp15 = phi <4 x half> [ %tmp9, %bb11 ], [ %tmp29, %bb14 ]
+  %tmp16 = phi i32 [ 0, %bb11 ], [ %tmp30, %bb14 ]
+  %tmp17 = add nsw i32 %tmp16, %tmp12
+  %tmp18 = sext i32 %tmp17 to i64
+  %tmp19 = getelementptr inbounds <4 x i8>, <4 x i8>* %arg, i64 %tmp18
+  %tmp20 = load <4 x i8>, <4 x i8>* %tmp19, align 4
+  %tmp21 = tail call <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %tmp20)
+  %tmp22 = add nsw i32 %tmp16, %tmp13
+  %tmp23 = sext i32 %tmp22 to i64
+  %tmp24 = getelementptr inbounds float, float addrspace(4)* %arg1, i64 %tmp23
+  %tmp25 = load float, float addrspace(4)* %tmp24, align 4
+  %tmp26 = fptrunc float %tmp25 to half
+  %tmp27 = insertelement <4 x half> undef, half %tmp26, i32 0
+  %tmp28 = shufflevector <4 x half> %tmp27, <4 x half> undef, <4 x i32> zeroinitializer
+  %vec.A.0 = extractelement <4 x half> %tmp21, i32 0
+  %vec.B.0 = extractelement <4 x half> %tmp28, i32 0
+  %vec.C.0 = extractelement <4 x half> %tmp15, i32 0
+  %vec.res.0 = tail call half @llvm.fmuladd.f16(half %vec.A.0, half %vec.B.0, half %vec.C.0)
+  %vec.A.1 = extractelement <4 x half> %tmp21, i32 1
+  %vec.B.1 = extractelement <4 x half> %tmp28, i32 1
+  %vec.C.1 = extractelement <4 x half> %tmp15, i32 1
+  %vec.res.1 = tail call half @llvm.fmuladd.f16(half %vec.A.1, half %vec.B.1, half %vec.C.1)
+  %vec.A.2 = extractelement <4 x half> %tmp21, i32 2
+  %vec.B.2 = extractelement <4 x half> %tmp28, i32 2
+  %vec.C.2 = extractelement <4 x half> %tmp15, i32 2
+  %vec.res.2 = tail call half @llvm.fmuladd.f16(half %vec.A.2, half %vec.B.2, half %vec.C.2)
+  %vec.A.3 = extractelement <4 x half> %tmp21, i32 3
+  %vec.B.3 = extractelement <4 x half> %tmp28, i32 3
+  %vec.C.3 = extractelement <4 x half> %tmp15, i32 3
+  %vec.res.3 = tail call half @llvm.fmuladd.f16(half %vec.A.3, half %vec.B.3, half %vec.C.3)
+  %full.res.0 = insertelement <4 x half> undef, half %vec.res.0, i32 0
+  %full.res.1 = insertelement <4 x half> %full.res.0, half %vec.res.1, i32 1
+  %full.res.2 = insertelement <4 x half> %full.res.1, half %vec.res.2, i32 2
+  %tmp29 = insertelement <4 x half> %full.res.2, half %vec.res.3, i32 3
+  %tmp30 = add nuw nsw i32 %tmp16, 1
+  %tmp31 = icmp eq i32 %tmp30, %tmp6
+  br i1 %tmp31, label %bb32, label %bb14
+
+bb32:                                             ; preds = %bb14, %bb8
+  %tmp33 = phi <4 x half> [ %tmp9, %bb8 ], [ %tmp29, %bb14 ]
+  %tmp34 = add nuw nsw i32 %tmp10, 1
+  %tmp35 = icmp eq i32 %tmp34, %tmp
+  br i1 %tmp35, label %bb36, label %bb8
+
+bb36:                                             ; preds = %bb32, %bb
+  %tmp37 = phi <4 x half> [ zeroinitializer, %bb ], [ %tmp33, %bb32 ]
+  ret <4 x half> %tmp37
 }
 
-define linkonce_odr hidden <4 x half> @_Z13convert_half4Dv4_h(<4 x i8>) local_unnamed_addr #1 {
-  %2 = extractelement <4 x i8> %0, i64 0
-  %3 = uitofp i8 %2 to half
-  %4 = insertelement <4 x half> undef, half %3, i32 0
-  %5 = extractelement <4 x i8> %0, i64 1
-  %6 = uitofp i8 %5 to half
-  %7 = insertelement <4 x half> %4, half %6, i32 1
-  %8 = extractelement <4 x i8> %0, i64 2
-  %9 = uitofp i8 %8 to half
-  %10 = insertelement <4 x half> %7, half %9, i32 2
-  %11 = extractelement <4 x i8> %0, i64 3
-  %12 = uitofp i8 %11 to half
-  %13 = insertelement <4 x half> %10, half %12, i32 3
-  ret <4 x half> %13
+; Function Attrs: norecurse nounwind readnone
+define linkonce_odr hidden <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %arg) local_unnamed_addr #1 {
+bb:
+  %tmp = extractelement <4 x i8> %arg, i64 0
+  %tmp1 = uitofp i8 %tmp to half
+  %tmp2 = insertelement <4 x half> undef, half %tmp1, i32 0
+  %tmp3 = extractelement <4 x i8> %arg, i64 1
+  %tmp4 = uitofp i8 %tmp3 to half
+  %tmp5 = insertelement <4 x half> %tmp2, half %tmp4, i32 1
+  %tmp6 = extractelement <4 x i8> %arg, i64 2
+  %tmp7 = uitofp i8 %tmp6 to half
+  %tmp8 = insertelement <4 x half> %tmp5, half %tmp7, i32 2
+  %tmp9 = extractelement <4 x i8> %arg, i64 3
+  %tmp10 = uitofp i8 %tmp9 to half
+  %tmp11 = insertelement <4 x half> %tmp8, half %tmp10, i32 3
+  ret <4 x half> %tmp11
 }
 
-declare <4 x half> @llvm.fmuladd.v4f16(<4 x half>, <4 x half>, <4 x half>)
+declare half @llvm.fmuladd.f16(half, half, half)
 
 attributes #0 = { convergent nounwind readonly}
 attributes #1 = { norecurse nounwind readnone }
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.fma.f16.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.fma.f16.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.fma.f16.ll
@@ -1,8 +1,10 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,VIGFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VIGFX9 %s
 
 declare half @llvm.fma.f16(half %a, half %b, half %c)
 declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+declare <4 x half> @llvm.fma.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)
 
 ; GCN-LABEL: {{^}}fma_f16
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -13,7 +15,7 @@
 ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
 ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
+; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_f16(
@@ -38,8 +40,8 @@
 ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
 ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[B_F32:[0-9]]], s[[A_F32:[0-9]]], v[[C_F32:[0-9]]]
 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
-; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], s[[A_F16]], v[[C_F16]]
+; VIGFX9: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], s[[A_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_f16_imm_a(
@@ -61,8 +63,8 @@
 ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
 ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], s[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
-; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], s[[B_F16]], v[[C_F16]]
+; VIGFX9: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], s[[B_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_f16_imm_b(
@@ -84,8 +86,8 @@
 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
 ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], s[[C_F32:[0-9]]]
 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
-; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], s[[C_F16]]
+; VIGFX9: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], s[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_f16_imm_c(
@@ -127,9 +129,11 @@
 ; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
 ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
 
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
+
+; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
+; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_v2f16(
@@ -150,14 +154,14 @@
 ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 
-; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 
 ; SI: s_mov_b32 s[[A_F32:[0-9]+]], 0x40400000{{$}}
-; VI: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
-; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; GCN-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; VIGFX9: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
+; SIVI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SIVI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
 
 ; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
@@ -172,9 +176,11 @@
 ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], s[[A_F16]], v[[B_F16_1]]
 ; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]]
 
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]]
+
+; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
+; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_v2f16_imm_a(
@@ -192,11 +198,11 @@
 ; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
 ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 
-; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]]
 
 ; SI: s_mov_b32 s[[B_F32:[0-9]+]], 0x40400000{{$}}
-; VI: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
 
 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
 ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
@@ -215,9 +221,11 @@
 ; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], s[[B_F16]], v[[C_V2_F16]]
 ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], s[[B_F16]], v[[C_F16_1]]
 
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], s[[B_F16]], v[[C_V2_F16]]
+
+; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
+; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_v2f16_imm_b(
@@ -235,11 +243,11 @@
 ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 
-; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 
 ; SI: s_mov_b32 s[[C_F32:[0-9]+]], 0x40400000{{$}}
-; VI: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
 
 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
@@ -265,6 +273,7 @@
 
 ; GCN-NOT: and
 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
+; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], s[[C_F16]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
 
@@ -278,3 +287,74 @@
   store <2 x half> %r.val, <2 x half> addrspace(1)* %r
   ret void
 }
+
+; GCN-LABEL: {{^}}fma_v4f16
+; GCN: buffer_load_dwordx2 v{{\[}}[[A_V4_F16_LO:[0-9]+]]:[[A_V4_F16_HI:[0-9]+]]{{\]}}
+; GCN: buffer_load_dwordx2 v{{\[}}[[B_V4_F16_LO:[0-9]+]]:[[B_V4_F16_HI:[0-9]+]]{{\]}}
+; GCN: buffer_load_dwordx2 v{{\[}}[[C_V4_F16_LO:[0-9]+]]:[[C_V4_F16_HI:[0-9]+]]{{\]}}
+
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V4_F16_LO]]
+; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_0:[0-9]+]], 16, v[[A_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V4_F16_HI]]
+; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_2:[0-9]+]], 16, v[[A_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V4_F16_LO]]
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_0:[0-9]+]], 16, v[[B_V4_F16_LO]]
+; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_0:[0-9]+]], 16, v[[C_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_V4_F16_HI]]
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V4_F16_HI]]
+; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_2:[0-9]+]], v[[A_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_3:[0-9]+]], v[[A_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_2:[0-9]+]], v[[B_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_3:[0-9]+]], v[[B_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_2:[0-9]+]], v[[C_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_3:[0-9]+]], v[[C_V4_F16_HI]]
+
+; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
+; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
+; SI-DAG: v_fma_f32 v[[R_F32_2:[0-9]+]], v[[A_F32_2]], v[[B_F32_2]], v[[C_F32_2]]
+; SI-DAG: v_fma_f32 v[[R_F32_3:[0-9]+]], v[[A_F32_3]], v[[B_F32_3]], v[[C_F32_3]]
+
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_2:[0-9]+]], v[[R_F32_2]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_3:[0-9]+]], v[[R_F32_3]]
+
+; SI-DAG: v_lshlrev_b32_e32 v[[R1_F16_0:[0-9]]], 16, v[[R_F16_2]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R1_F16_1:[0-9]]], 16, v[[R_F16_3]]
+
+; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_0:[0-9]+]], 16, v[[A_V4_F16_LO]]
+; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V4_F16_HI]]
+; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_0:[0-9]+]], 16, v[[B_V4_F16_LO]]
+; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V4_F16_HI]]
+; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_0:[0-9]+]], 16, v[[C_V4_F16_LO]]
+; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V4_F16_HI]]
+
+; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V4_F16_LO]], v[[B_V4_F16_LO]], v[[C_V4_F16_LO]]
+; VI-DAG: v_fma_f16 v[[R1_F16_0:[0-9]+]], v[[A_F16_0]], v[[B_F16_0]], v[[C_F16_0]]
+; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_V4_F16_HI]], v[[B_V4_F16_HI]], v[[C_V4_F16_HI]]
+; VI-DAG: v_fma_f16 v[[R1_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
+
+; SIVI-DAG: v_or_b32_e32 v[[R_V4_F16_LO:[0-9]+]], v[[R_F16_0]], v[[R1_F16_0]]
+; SIVI-DAG: v_or_b32_e32 v[[R_V4_F16_HI:[0-9]+]], v[[R_F16_1]], v[[R1_F16_1]]
+
+; GFX9-DAG: v_pk_fma_f16 v[[R_V4_F16_LO:[0-9]+]], v[[A_V4_F16_LO]], v[[B_V4_F16_LO]], v[[C_V4_F16_LO]]
+; GFX9-DAG: v_pk_fma_f16 v[[R_V4_F16_HI:[0-9]+]], v[[A_V4_F16_HI]], v[[B_V4_F16_HI]], v[[C_V4_F16_HI]]
+
+; GCN: buffer_store_dwordx2 v{{\[}}[[R_V4_F16_LO]]:[[R_V4_F16_HI]]{{\]}}
+; GCN: s_endpgm
+
+define amdgpu_kernel void @fma_v4f16(
+    <4 x half> addrspace(1)* %r,
+    <4 x half> addrspace(1)* %a,
+    <4 x half> addrspace(1)* %b,
+    <4 x half> addrspace(1)* %c) {
+  %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
+  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
+  %c.val = load <4 x half>, <4 x half> addrspace(1)* %c
+  %r.val = call <4 x half> @llvm.fma.v4f16(<4 x half> %a.val, <4 x half> %b.val, <4 x half> %c.val)
+  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
+  ret void
+}
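
For reference, a minimal standalone sketch of what the new lowering path handles (hypothetical reproducer, not part of the patch; the function and value names are illustrative, and the llc flags are taken from the RUN lines above). With ISD::FMA on v4f16 marked Custom, splitTernaryVectorOp splits the node into two v2f16 halves, which on gfx9 should select as two v_pk_fma_f16 instructions, as the fma_v4f16 test checks:

; run with: llc -march=amdgcn -mcpu=gfx900 < fma4.ll
declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)

define <4 x half> @split_fma_example(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
  ; Lowered via splitTernaryVectorOp into two v2f16 FMA nodes,
  ; then recombined with CONCAT_VECTORS.
  %r = call <4 x half> @llvm.fma.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)
  ret <4 x half> %r
}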