Index: include/clang/Basic/arm_neon.td =================================================================== --- include/clang/Basic/arm_neon.td +++ include/clang/Basic/arm_neon.td @@ -199,6 +199,13 @@ (bitcast "int16_t", $p0), (bitcast "int16x8_t", $p1), $p2))>; +def OP_DOT_LN + : Op<(call "vdot", $p0, $p1, + (bitcast $p1, (splat(bitcast "uint32x2_t", $p2), $p3)))>; +def OP_DOT_LNQ + : Op<(call "vdot", $p0, $p1, + (bitcast $p1, (splat(bitcast "uint32x4_t", $p2), $p3)))>; + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -1579,3 +1586,13 @@ def SCALAR_VDUP_LANEH : IInst<"vdup_lane", "sdi", "Sh">; def SCALAR_VDUP_LANEQH : IInst<"vdup_laneq", "sji", "Sh">; } + +// v8.2-A dot product instructions. +let ArchGuard = "defined(__ARM_FEATURE_DOTPROD)" in { + def DOT : SInst<"vdot", "dd88", "iQiUiQUi">; + def DOT_LANE : SOpInst<"vdot_lane", "dd87i", "iUiQiQUi", OP_DOT_LN>; +} +let ArchGuard = "defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)" in { + // Variants indexing into a 128-bit vector are A64 only. + def UDOT_LANEQ : SOpInst<"vdot_laneq", "dd89i", "iUiQiQUi", OP_DOT_LNQ>; +} Index: include/clang/Basic/arm_neon_incl.td =================================================================== --- include/clang/Basic/arm_neon_incl.td +++ include/clang/Basic/arm_neon_incl.td @@ -253,6 +253,9 @@ // B,C,D: array of default elts, force 'Q' size modifier. // p: pointer type // c: const pointer type +// 7: vector of 8-bit elements, ignore 'Q' size modifier +// 8: vector of 8-bit elements, same width as default type +// 9: vector of 8-bit elements, force 'Q' size modifier // Every intrinsic subclasses Inst. 
class Inst { Index: lib/CodeGen/CGBuiltin.cpp =================================================================== --- lib/CodeGen/CGBuiltin.cpp +++ lib/CodeGen/CGBuiltin.cpp @@ -3867,6 +3867,8 @@ NEONMAP0(vcvtq_u16_v), NEONMAP0(vcvtq_u32_v), NEONMAP0(vcvtq_u64_v), + NEONMAP2(vdot_v, arm_neon_udot, arm_neon_sdot, 0), + NEONMAP2(vdotq_v, arm_neon_udot, arm_neon_sdot, 0), NEONMAP0(vext_v), NEONMAP0(vextq_v), NEONMAP0(vfma_v), @@ -4061,6 +4063,8 @@ NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType), + NEONMAP2(vdot_v, aarch64_neon_udot, aarch64_neon_sdot, 0), + NEONMAP2(vdotq_v, aarch64_neon_udot, aarch64_neon_sdot, 0), NEONMAP0(vext_v), NEONMAP0(vextq_v), NEONMAP0(vfma_v), @@ -4974,6 +4978,14 @@ } return SV; } + case NEON::BI__builtin_neon_vdot_v: + case NEON::BI__builtin_neon_vdotq_v: { + llvm::Type *InputTy = + llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); + llvm::Type *Tys[2] = { Ty, InputTy }; + Int = Usgn ? 
LLVMIntrinsic : AltLLVMIntrinsic; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot"); + } } assert(Int && "Expected valid intrinsic number"); Index: test/CodeGen/aarch64-neon-dot-product.c =================================================================== --- test/CodeGen/aarch64-neon-dot-product.c +++ test/CodeGen/aarch64-neon-dot-product.c @@ -0,0 +1,117 @@ +// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +dotprod \ +// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -instcombine | FileCheck %s + +// REQUIRES: aarch64-registered-target + +// Test AArch64 Armv8.2-A dot product intrinsics + +#include <arm_neon.h> + +uint32x2_t test_vdot_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_u32(a, b, c); +} + +uint32x4_t test_vdotq_u32(uint32x4_t a, uint8x16_t b, uint8x16_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_u32(a, b, c); +} + +int32x2_t test_vdot_s32(int32x2_t a, int8x8_t b, int8x8_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_s32(a, b, c); +} + +int32x4_t test_vdotq_s32(int32x4_t a, int8x16_t b, int8x16_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: ret 
<4 x i32> [[RESULT]] + return vdotq_s32(a, b, c); +} + +uint32x2_t test_vdot_lane_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]]) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_lane_u32(a, b, c, 1); +} + +uint32x4_t test_vdotq_lane_u32(uint32x4_t a, uint8x16_t b, uint8x8_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]]) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_lane_u32(a, b, c, 1); +} + +uint32x2_t test_vdot_laneq_u32(uint32x2_t a, uint8x8_t b, uint8x16_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_laneq_u32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]]) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_laneq_u32(a, b, c, 1); +} + +uint32x4_t test_vdotq_laneq_u32(uint32x4_t a, uint8x16_t b, 
<16 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]]) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_laneq_u32(a, b, c, 1); +} + +int32x2_t test_vdot_lane_s32(int32x2_t a, int8x8_t b, int8x8_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]]) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_lane_s32(a, b, c, 1); +} + +int32x4_t test_vdotq_lane_s32(int32x4_t a, int8x16_t b, int8x8_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]]) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_lane_s32(a, b, c, 1); +} + +int32x2_t test_vdot_laneq_s32(int32x2_t a, int8x8_t b, int8x16_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_laneq_s32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] 
to <8 x i8> +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]]) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_laneq_s32(a, b, c, 1); +} + +int32x4_t test_vdotq_laneq_s32(int32x4_t a, int8x16_t b, int8x16_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_laneq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]]) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_laneq_s32(a, b, c, 1); +} + Index: test/CodeGen/arm-neon-dot-product.c =================================================================== --- test/CodeGen/arm-neon-dot-product.c +++ test/CodeGen/arm-neon-dot-product.c @@ -0,0 +1,76 @@ +// RUN: %clang_cc1 -triple armv8-linux-gnueabihf -target-cpu cortex-a75 -target-feature +dotprod \ +// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -instcombine | FileCheck %s + +// REQUIRES: arm-registered-target + +// Test ARM v8.2-A dot product intrinsics + +#include <arm_neon.h> + +uint32x2_t test_vdot_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_u32(a, b, c); +} + +uint32x4_t test_vdotq_u32(uint32x4_t a, uint8x16_t b, uint8x16_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_u32(a, b, c); +} + 
+int32x2_t test_vdot_s32(int32x2_t a, int8x8_t b, int8x8_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_s32(a, b, c); +} + +int32x4_t test_vdotq_s32(int32x4_t a, int8x16_t b, int8x16_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_s32(a, b, c); +} + +uint32x2_t test_vdot_lane_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]]) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_lane_u32(a, b, c, 1); +} + +uint32x4_t test_vdotq_lane_u32(uint32x4_t a, uint8x16_t b, uint8x8_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]]) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_lane_u32(a, b, c, 1); +} + +int32x2_t test_vdot_lane_s32(int32x2_t a, int8x8_t b, int8x8_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, 
<8 x i8> %b, <8 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]]) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_lane_s32(a, b, c, 1); +} + +int32x4_t test_vdotq_lane_s32(int32x4_t a, int8x16_t b, int8x8_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]]) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_lane_s32(a, b, c, 1); +} Index: utils/TableGen/NeonEmitter.cpp =================================================================== --- utils/TableGen/NeonEmitter.cpp +++ utils/TableGen/NeonEmitter.cpp @@ -995,6 +995,19 @@ if (!AppliedQuad) Bitwidth *= 2; break; + case '7': + if (AppliedQuad) + Bitwidth /= 2; + ElementBitwidth = 8; + break; + case '8': + ElementBitwidth = 8; + break; + case '9': + if (!AppliedQuad) + Bitwidth *= 2; + ElementBitwidth = 8; + break; default: llvm_unreachable("Unhandled character!"); }