Index: llvm/include/llvm/IR/IntrinsicsAArch64.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsAArch64.td +++ llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -154,6 +154,11 @@ : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], [IntrNoMem]>; + + class AdvSIMD_FP16FML_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], + [IntrNoMem]>; } // Arithmetic ops @@ -424,6 +429,12 @@ // v8.2-A Dot Product def int_aarch64_neon_udot : AdvSIMD_Dot_Intrinsic; def int_aarch64_neon_sdot : AdvSIMD_Dot_Intrinsic; + + // v8.2-A FP16 Fused Multiply-Add Long + def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic; + def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic; + def int_aarch64_neon_fmlal2 : AdvSIMD_FP16FML_Intrinsic; + def int_aarch64_neon_fmlsl2 : AdvSIMD_FP16FML_Intrinsic; } let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -4941,14 +4941,27 @@ let Inst{4-0} = Rd; } -let Predicates = [HasNEON, HasFP16FML] in +// ARMv8.2 Fused Multiply Add Long Instructions (Vector) class BaseSIMDThreeSameMult size, string asm, string kind1, - string kind2> : - BaseSIMDThreeSameVector { + string kind2, RegisterOperand RegType, + ValueType AccumType, ValueType InputType, + SDPatternOperator OpNode> : + BaseSIMDThreeSameVectorTied { let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}"); let Inst{13} = b13; } +multiclass SIMDThreeSameMult size, string asm, SDPatternOperator OpNode> { + def v4f16 : BaseSIMDThreeSameMult<0, U, b13, size, asm, ".2s", ".2h", V64, + v2f32, v4f16, OpNode>; + def v8f16 : BaseSIMDThreeSameMult<1, U, b13, size, asm, ".4s", ".4h", V128, + v4f32, v8f16, OpNode>; +} + class BaseSIMDThreeSameVectorDot opc, string asm, string dst_kind, string lhs_kind, - string rhs_kind> : - BaseSIMDIndexedTied { - //idx = H:L:M + string rhs_kind, RegisterOperand RegType, + ValueType AccumType, ValueType InputType, + SDPatternOperator OpNode> : + BaseSIMDIndexedTied { + // idx = H:L:M bits<3> idx; let Inst{11} = idx{2}; // H let Inst{21} = idx{1}; // L @@ -7455,6 +7474,13 @@ v4i32, v16i8, OpNode>; } +multiclass SIMDThreeSameMultIndex opc, string asm, SDPatternOperator OpNode> { + def v4f16 : BaseSIMDThreeSameMultIndex<0, U, opc, asm, ".2s", ".2h", ".h", V64, + v2f32, v4f16, OpNode>; + def v8f16 : BaseSIMDThreeSameMultIndex<1, U, opc, asm, ".4s", ".4h", ".h", V128, + v4f32, v8f16, OpNode>; +} + multiclass SIMDFPIndexed opc, string asm, SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -3439,22 +3439,16 @@ int_aarch64_neon_sqsub>; // FP16FML -def FMLAL_2S : BaseSIMDThreeSameMult<0, 0, 1, 0b001, "fmlal", ".2s", ".2h">; -def FMLSL_2S : BaseSIMDThreeSameMult<0, 0, 1, 0b101, "fmlsl", ".2s", ".2h">; -def FMLAL_4S : BaseSIMDThreeSameMult<1, 0, 1, 0b001, "fmlal", ".4s", ".4h">; -def FMLSL_4S : BaseSIMDThreeSameMult<1, 0, 1, 0b101, "fmlsl", ".4s", ".4h">; -def FMLAL2_2S : BaseSIMDThreeSameMult<0, 1, 0, 0b001, "fmlal2", ".2s", ".2h">; -def FMLSL2_2S : BaseSIMDThreeSameMult<0, 1, 0, 0b101, "fmlsl2", ".2s", ".2h">; -def FMLAL2_4S : BaseSIMDThreeSameMult<1, 1, 0, 0b001, "fmlal2", ".4s", ".4h">; -def FMLSL2_4S : BaseSIMDThreeSameMult<1, 1, 0, 0b101, "fmlsl2", ".4s", ".4h">; -def FMLALI_2s : BaseSIMDThreeSameMultIndex<0, 0, 0b0000, "fmlal", ".2s", ".2h", ".h">; -def FMLSLI_2s : BaseSIMDThreeSameMultIndex<0, 0, 0b0100, "fmlsl", ".2s", ".2h", ".h">; -def FMLALI_4s : BaseSIMDThreeSameMultIndex<1, 0, 0b0000, "fmlal", ".4s", ".4h", ".h">; -def FMLSLI_4s : BaseSIMDThreeSameMultIndex<1, 0, 0b0100, "fmlsl", ".4s", ".4h", ".h">; -def FMLALI2_2s : BaseSIMDThreeSameMultIndex<0, 1, 0b1000, "fmlal2", ".2s", ".2h", ".h">; -def FMLSLI2_2s : BaseSIMDThreeSameMultIndex<0, 1, 0b1100, "fmlsl2", ".2s", ".2h", ".h">; -def FMLALI2_4s : BaseSIMDThreeSameMultIndex<1, 1, 0b1000, "fmlal2", ".4s", ".4h", ".h">; -def FMLSLI2_4s : BaseSIMDThreeSameMultIndex<1, 1, 0b1100, "fmlsl2", ".4s", ".4h", ".h">; +let Predicates = [HasNEON, HasFP16FML] in { +defm FMLAL : SIMDThreeSameMult<0, 1, 0b001, "fmlal", int_aarch64_neon_fmlal>; +defm FMLSL : SIMDThreeSameMult<0, 1, 0b101, "fmlsl", int_aarch64_neon_fmlsl>; +defm FMLAL2 : SIMDThreeSameMult<1, 0, 0b001, "fmlal2", int_aarch64_neon_fmlal2>; +defm FMLSL2 : SIMDThreeSameMult<1, 0, 0b101, "fmlsl2", int_aarch64_neon_fmlsl2>; +defm FMLALlane : SIMDThreeSameMultIndex<0, 0b0000, "fmlal", int_aarch64_neon_fmlal>; +defm FMLSLlane : SIMDThreeSameMultIndex<0, 0b0100, "fmlsl", int_aarch64_neon_fmlsl>; +defm FMLAL2lane : SIMDThreeSameMultIndex<1, 0b1000, "fmlal2", int_aarch64_neon_fmlal2>; +defm FMLSL2lane : SIMDThreeSameMultIndex<1, 0b1100, "fmlsl2", int_aarch64_neon_fmlsl2>; +} defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", Index: llvm/test/CodeGen/AArch64/neon-fp16fml.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/neon-fp16fml.ll @@ -0,0 +1,74 @@ +; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+fp16fml < %s | FileCheck %s + +declare <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float>, <4 x half>, <4 x half>) +declare <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float>, <4 x half>, <4 x half>) +declare <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float>, <4 x half>, <4 x half>) +declare <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float>, <4 x half>, <4 x half>) +declare <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>) +declare <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>) +declare <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>) +declare <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>) + +define <2 x float> @test_vfmlal_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) #0 { +entry: +; CHECK-LABEL: test_vfmlal_low_u32: +; CHECK: fmlal v0.2s, v1.2h, v2.2h + %vfmlal_low2.i = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c) #2 + ret <2 x float> %vfmlal_low2.i +} + +define <2 x float> @test_vfmlsl_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) #0 { +entry: +; CHECK-LABEL: test_vfmlsl_low_u32: +; CHECK: fmlsl v0.2s, v1.2h, v2.2h + %vfmlsl_low2.i = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c) #2 + ret <2 x float> %vfmlsl_low2.i +} + +define <2 x float> @test_vfmlal_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) #0 { +entry: +; CHECK-LABEL: test_vfmlal_high_u32: +; CHECK: fmlal2 v0.2s, v1.2h, v2.2h + %vfmlal_high2.i = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c) #2 + ret <2 x float> %vfmlal_high2.i +} + +define <2 x float> @test_vfmlsl_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) #0 { +entry: +; CHECK-LABEL: test_vfmlsl_high_u32: +; CHECK: fmlsl2 v0.2s, v1.2h, v2.2h + %vfmlsl_high2.i = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c) #2 + ret <2 x float> %vfmlsl_high2.i +} + +define <4 x float> @test_vfmlalq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) #0 { +entry: +; CHECK-LABEL: test_vfmlalq_low_u32: +; CHECK: fmlal v0.4s, v1.4h, v2.4h + %vfmlalq_low4.i = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c) #2 + ret <4 x float> %vfmlalq_low4.i +} + +define <4 x float> @test_vfmlslq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) #0 { +entry: +; CHECK-LABEL: test_vfmlslq_low_u32: +; CHECK: fmlsl v0.4s, v1.4h, v2.4h + %vfmlslq_low4.i = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c) #2 + ret <4 x float> %vfmlslq_low4.i +} + +define <4 x float> @test_vfmlalq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) #0 { +entry: +; CHECK-LABEL: test_vfmlalq_high_u32: +; CHECK: fmlal2 v0.4s, v1.4h, v2.4h + %vfmlalq_high4.i = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c) #2 + ret <4 x float> %vfmlalq_high4.i +} + +define <4 x float> @test_vfmlslq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) #0 { +entry: +; CHECK-LABEL: test_vfmlslq_high_u32: +; CHECK: fmlsl2 v0.4s, v1.4h, v2.4h + %vfmlslq_high4.i = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c) #2 + ret <4 x float> %vfmlslq_high4.i +}