Summary of this patch (text before the first "diff --git" line is ignored by
patch tools):

  * arm_neon.td: for every rotation suffix in VCMLA_ROTS, add the _lane and
    _laneq forms of vcmla / vcmlaq. Each one selects a lane from the third
    operand, duplicates it via dup_typed, and forwards to the existing
    non-lane "vcmla" # ROT intrinsic.
  * aarch64-neon-vcmla.c: add CodeGen tests for the f16 and f32
    lane/laneq variants of the unrotated intrinsic.
  * NeonEmitter.cpp: let dup_typed() take its target type as a string
    literal naming a typedef, in addition to a DAG argument.

NOTE(review): the shufflevector CHECK lines below end at the mask type; the
mask constants may have been lost when this patch was extracted. Confirm
against the upstream test before relying on them.
NOTE(review): the VCMLA_ROTS parameter list on the first context line was
reconstructed from the names used in the body; confirm the parameter order
against arm_neon.td.

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -1902,6 +1902,25 @@
 multiclass VCMLA_ROTS<string type, string lanety, string laneqty> {
   foreach ROT = ["", "_rot90", "_rot180", "_rot270" ] in {
     def : SInst<"vcmla" # ROT, "....", type # "Q" # type>;
+
+    // vcmla{ROT}_lane
+    def : SOpInst<"vcmla" # ROT # "_lane", "...qI", type, Op<(call "vcmla" # ROT, $p0, $p1,
+           (bitcast $p0, (dup_typed lanety , (call "vget_lane", (bitcast lanety, $p2), $p3))))>>;
+
+    // vcmlaq{ROT}_lane
+    def : SOpInst<"vcmla" # ROT # "_lane", "...qI", "Q" # type, Op<(call "vcmla" # ROT, $p0, $p1,
+           (bitcast $p0, (dup_typed laneqty , (call "vget_lane", (bitcast lanety, $p2), $p3))))>>;
+
+
+    let isLaneQ = 1 in {
+      // vcmla{ROT}_laneq
+      def : SOpInst<"vcmla" # ROT # "_laneq", "...QI", type, Op<(call "vcmla" # ROT, $p0, $p1,
+             (bitcast $p0, (dup_typed lanety, (call "vget_lane", (bitcast laneqty, $p2), $p3))))>>;
+
+      // vcmlaq{ROT}_laneq
+      def : SOpInst<"vcmla" # ROT # "_laneq", "...QI", "Q" # type, Op<(call "vcmla" # ROT, $p0, $p1,
+             (bitcast $p0, (dup_typed laneqty , (call "vget_lane", (bitcast laneqty, $p2), $p3))))>>;
+    }
   }
 }
 
diff --git a/clang/test/CodeGen/aarch64-neon-vcmla.c b/clang/test/CodeGen/aarch64-neon-vcmla.c
--- a/clang/test/CodeGen/aarch64-neon-vcmla.c
+++ b/clang/test/CodeGen/aarch64-neon-vcmla.c
@@ -144,3 +144,83 @@
 float64x2_t test_vcmlaq_rot270_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
   return vcmlaq_rot270_f64(acc, lhs, rhs);
 }
+
+// CHECK-LABEL: @test_vcmla_lane_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: ret <4 x half> [[RES]]
+float16x4_t test_vcmla_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
+  return vcmla_lane_f16(acc, lhs, rhs, 1);
+}
+
+// ACLE says this exists, but it won't map to a single instruction if lane > 1.
+// CHECK-LABEL: @test_vcmla_laneq_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: ret <4 x half> [[RES]]
+float16x4_t test_vcmla_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
+  return vcmla_laneq_f16(acc, lhs, rhs, 3);
+}
+
+// CHECK-LABEL: @test_vcmlaq_lane_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <4 x i32>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
+// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
+// CHECK: ret <8 x half> [[RES]]
+float16x8_t test_vcmlaq_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
+  return vcmlaq_lane_f16(acc, lhs, rhs, 1);
+}
+
+// CHECK-LABEL: @test_vcmlaq_laneq_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
+// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
+// CHECK: ret <8 x half> [[RES]]
+float16x8_t test_vcmlaq_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
+  return vcmlaq_laneq_f16(acc, lhs, rhs, 3);
+}
+
+// CHECK-LABEL: @test_vcmla_lane_f32(
+// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
+// CHECK: ret <2 x float> [[RES]]
+float32x2_t test_vcmla_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
+  return vcmla_lane_f32(acc, lhs, rhs, 0);
+}
+
+// ACLE says this exists, but it won't map to a single instruction if lane > 1.
+// CHECK-LABEL: @test_vcmla_laneq_f32(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
+// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
+// CHECK: ret <2 x float> [[RES]]
+float32x2_t test_vcmla_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
+  return vcmla_laneq_f32(acc, lhs, rhs, 1);
+}
+
+// CHECK-LABEL: @test_vcmlaq_lane_f32(
+// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
+// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i32 0
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> undef, <2 x i32> zeroinitializer
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: ret <4 x float> [[RES]]
+float32x4_t test_vcmlaq_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
+  return vcmlaq_lane_f32(acc, lhs, rhs, 0);
+}
+
+// CHECK-LABEL: @test_vcmlaq_laneq_f32(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: ret <4 x float> [[RES]]
+float32x4_t test_vcmlaq_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
+  return vcmlaq_laneq_f32(acc, lhs, rhs, 1);
+}
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -1690,14 +1690,18 @@
 std::pair<Type, std::string>
 Intrinsic::DagEmitter::emitDagDupTyped(DagInit *DI) {
   assert_with_loc(DI->getNumArgs() == 2, "dup_typed() expects two arguments");
-  std::pair<Type, std::string> A =
-      emitDagArg(DI->getArg(0), std::string(DI->getArgNameStr(0)));
   std::pair<Type, std::string> B =
       emitDagArg(DI->getArg(1), std::string(DI->getArgNameStr(1)));
   assert_with_loc(B.first.isScalar(),
                   "dup_typed() requires a scalar as the second argument");
+  Type T;
+  // If the type argument is a constant string, construct the type directly.
+  if (StringInit *SI = dyn_cast<StringInit>(DI->getArg(0))) {
+    T = Type::fromTypedefName(SI->getAsUnquotedString());
+    assert_with_loc(!T.isVoid(), "Unknown typedef");
+  } else
+    T = emitDagArg(DI->getArg(0), std::string(DI->getArgNameStr(0))).first;
 
-  Type T = A.first;
   assert_with_loc(T.isVector(), "dup_typed() used but target type is scalar!");
   std::string S = "(" + T.str() + ") {";
   for (unsigned I = 0; I < T.getNumElements(); ++I) {