Index: clang/include/clang/Basic/arm_neon.td
===================================================================
--- clang/include/clang/Basic/arm_neon.td
+++ clang/include/clang/Basic/arm_neon.td
@@ -1499,11 +1499,9 @@
   def VMULX_LANEH : IOpInst<"vmulx_lane", "ddgi", "hQh", OP_MULX_LN>;
   def VMULX_LANEQH : IOpInst<"vmulx_laneq", "ddji", "hQh", OP_MULX_LN>;
   def VMULX_NH : IOpInst<"vmulx_n", "dds", "hQh", OP_MULX_N>;
-  // TODO: Scalar floating point multiply extended (scalar, by element)
-  // Below ones are commented out because they need vmulx_f16(float16_t, float16_t)
-  // which will be implemented later with fp16 scalar intrinsic (arm_fp16.h)
-  //def SCALAR_FMULX_LANEH : IOpInst<"vmulx_lane", "ssdi", "Sh", OP_SCALAR_MUL_LN>;
-  //def SCALAR_FMULX_LANEQH : IOpInst<"vmulx_laneq", "ssji", "Sh", OP_SCALAR_MUL_LN>;
+  // Scalar floating point mulx (scalar, by element)
+  def SCALAR_FMULX_LANEH : IInst<"vmulx_lane", "ssdi", "Sh">;
+  def SCALAR_FMULX_LANEQH : IInst<"vmulx_laneq", "ssji", "Sh">;
 
   // ARMv8.2-A FP16 reduction vector intrinsics.
   def VMAXVH : SInst<"vmaxv", "sd", "hQh">;
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -7238,6 +7238,16 @@
     Int = Intrinsic::aarch64_neon_fmulx;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
   }
+  case NEON::BI__builtin_neon_vmulxh_lane_f16:
+  case NEON::BI__builtin_neon_vmulxh_laneq_f16: {
+    // vmulx_lane should be mapped to Neon scalar mulx after
+    // extracting the scalar element
+    Ops.push_back(EmitScalarExpr(E->getArg(2)));
+    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
+    Ops.pop_back();
+    Int = Intrinsic::aarch64_neon_fmulx;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmulx");
+  }
   case NEON::BI__builtin_neon_vmul_lane_v:
   case NEON::BI__builtin_neon_vmul_laneq_v: {
     // v1f64 vmul_lane should be mapped to Neon scalar mul lane
Index: clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
===================================================================
--- clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
+++ clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
@@ -1223,27 +1223,25 @@
   return vmulxq_n_f16(a, b);
 }
 
-/* TODO: Not implemented yet (needs scalar intrinsic from arm_fp16.h)
-// CCHECK-LABEL: test_vmulxh_lane_f16
-// CCHECK: [[CONV0:%.*]] = fpext half %a to float
-// CCHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float
-// CCHECK: [[MUL:%.*]] = fmul float [[CONV0:%.*]], [[CONV0:%.*]]
-// CCHECK: [[CONV3:%.*]] = fptrunc float %mul to half
-// CCHECK: ret half [[CONV3:%.*]]
+// CHECK-LABEL: test_vmulxh_lane_f16
+// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK: [[EXTR:%.*]] = extractelement <4 x half> [[TMP1]], i32 3
+// CHECK: [[MULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half [[EXTR]]
+// CHECK: ret half [[MULX]]
 float16_t test_vmulxh_lane_f16(float16_t a, float16x4_t b) {
   return vmulxh_lane_f16(a, b, 3);
 }
 
-// CCHECK-LABEL: test_vmulxh_laneq_f16
-// CCHECK: [[CONV0:%.*]] = fpext half %a to float
-// CCHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float
-// CCHECK: [[MUL:%.*]] = fmul float [[CONV0:%.*]], [[CONV0:%.*]]
-// CCHECK: [[CONV3:%.*]] = fptrunc float %mul to half
-// CCHECK: ret half [[CONV3:%.*]]
+// CHECK-LABEL: test_vmulxh_laneq_f16
+// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK: [[EXTR:%.*]] = extractelement <8 x half> [[TMP1]], i32 7
+// CHECK: [[MULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half [[EXTR]])
+// CHECK: ret half [[MULX]]
 float16_t test_vmulxh_laneq_f16(float16_t a, float16x8_t b) {
   return vmulxh_laneq_f16(a, b, 7);
 }
-*/
 
 // CHECK-LABEL: test_vmaxv_f16
 // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
Index: llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
===================================================================
--- llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
+++ llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+v8.2a,+fullfp16 | FileCheck %s
 
+declare half @llvm.aarch64.neon.fmulx.f16(half, half)
 declare <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half>, <4 x half>)
 declare <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half>, <8 x half>)
 declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)
@@ -236,6 +237,25 @@
   ret half %1
 }
 
+define dso_local half @t_vmulx_f16(half %a, half %b) {
+; CHECK-LABEL: t_vmulx_f16:
+; CHECK: fmulx h0, h0, h1
+; CHECK-NEXT: ret
+entry:
+  %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %b)
+  ret half %fmulx.i
+}
+
+define dso_local half @t_vmulxh_lane_f16(half %a, <4 x half> %b, i32 %lane) {
+; CHECK-LABEL: t_vmulxh_lane_f16:
+; CHECK: fmulx h0, h0, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+  %extract = extractelement <4 x half> %b, i32 3
+  %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
+  ret half %fmulx.i
+}
+
 define dso_local <4 x half> @t_vmulx_lane_f16(<4 x half> %a, <4 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmulx_lane_f16:
 ; CHECK: fmulx v0.4h, v0.4h, v1.h[0]
@@ -276,6 +296,16 @@
   ret <8 x half> %vmulx2.i
 }
 
+define dso_local half @t_vmulxh_laneq_f16(half %a, <8 x half> %b, i32 %lane) {
+; CHECK-LABEL: t_vmulxh_laneq_f16:
+; CHECK: fmulx h0, h0, v1.h[7]
+; CHECK-NEXT: ret
+entry:
+  %extract = extractelement <8 x half> %b, i32 7
+  %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
+  ret half %fmulx.i
+}
+
 define dso_local <4 x half> @t_vmulx_n_f16(<4 x half> %a, half %c) {
 ; CHECK-LABEL: t_vmulx_n_f16:
 ; CHECK: dup v1.4h, v1.h[0]