Diff 271095

clang/include/clang/Basic/arm_neon.td

Show First 20 Lines • Show All 238 Lines • ▼ Show 20 Lines
// sudot splats the second vector and then calls vusdot		// sudot splats the second vector and then calls vusdot
def OP_SUDOT_LN		def OP_SUDOT_LN
: Op<(call "vusdot", $p0,		: Op<(call "vusdot", $p0,
(cast "8", "U", (call_mangled "splat_lane", (bitcast "int32x2_t", $p2), $p3)), $p1)>;		(cast "8", "U", (call_mangled "splat_lane", (bitcast "int32x2_t", $p2), $p3)), $p1)>;
def OP_SUDOT_LNQ		def OP_SUDOT_LNQ
: Op<(call "vusdot", $p0,		: Op<(call "vusdot", $p0,
(cast "8", "U", (call_mangled "splat_lane", (bitcast "int32x4_t", $p2), $p3)), $p1)>;		(cast "8", "U", (call_mangled "splat_lane", (bitcast "int32x4_t", $p2), $p3)), $p1)>;

		// bfdot_lane: reinterprets $p2 as float32x2_t so that one 32-bit lane (a pair
		// of bfloat16 elements) can be splatted with lane index $p3, bitcasts the
		// splat back to the type of $p1, and then calls vbfdot.
		def OP_BFDOT_LN
		: Op<(call "vbfdot", $p0, $p1,
		(bitcast $p1, (call_mangled "splat_lane", (bitcast "float32x2_t", $p2), $p3)))>;

		// bfdot_laneq: same as OP_BFDOT_LN, but $p2 is reinterpreted as float32x4_t
		// before lane $p3 is splatted and the result is bitcast to the type of $p1.
		def OP_BFDOT_LNQ
		: Op<(call "vbfdot", $p0, $p1,
		(bitcast $p1, (call_mangled "splat_lane", (bitcast "float32x4_t", $p2), $p3)))>;

		// bfmlalb_lane: reads lane $p3 of $p2 with vget_lane, duplicates that element
		// into a vector of the same type as $p1 (dup_typed), and then calls vbfmlalb.
		def OP_BFMLALB_LN
		: Op<(call "vbfmlalb", $p0, $p1,
		(dup_typed $p1, (call "vget_lane", $p2, $p3)))>;

		// bfmlalt_lane: reads lane $p3 of $p2 with vget_lane, duplicates that element
		// into a vector of the same type as $p1 (dup_typed), and then calls vbfmlalt.
		def OP_BFMLALT_LN
		: Op<(call "vbfmlalt", $p0, $p1,
		(dup_typed $p1, (call "vget_lane", $p2, $p3)))>;

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Auxiliary Instructions		// Auxiliary Instructions
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

// Splat operation - performs a range-checked splat over a vector		// Splat operation - performs a range-checked splat over a vector
def SPLAT : WInst<"splat_lane", ".(!q)I",		def SPLAT : WInst<"splat_lane", ".(!q)I",
"UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl">;		"UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl">;
def SPLATQ : WInst<"splat_laneq", ".(!Q)I",		def SPLATQ : WInst<"splat_laneq", ".(!Q)I",
▲ Show 20 Lines • Show All 1,587 Lines • ▼ Show 20 Lines	let ArchGuard = "defined(__ARM_FEATURE_MATMUL_INT8)" in {
let ArchGuard = "defined(__aarch64__)" in {		let ArchGuard = "defined(__aarch64__)" in {
let isLaneQ = 1 in {		let isLaneQ = 1 in {
def VUSDOT_LANEQ : SOpInst<"vusdot_laneq", "..(<<U)(<<Q)I", "iQi", OP_USDOT_LNQ>;		def VUSDOT_LANEQ : SOpInst<"vusdot_laneq", "..(<<U)(<<Q)I", "iQi", OP_USDOT_LNQ>;
def VSUDOT_LANEQ : SOpInst<"vsudot_laneq", "..(<<)(<<QU)I", "iQi", OP_SUDOT_LNQ>;		def VSUDOT_LANEQ : SOpInst<"vsudot_laneq", "..(<<)(<<QU)I", "iQi", OP_SUDOT_LNQ>;
}		}
}		}
}		}

		// BFloat16 intrinsics; only emitted when
		// __ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined.
		let ArchGuard = "defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC)" in {
		// vbfdot and its lane forms. The lane forms are SOpInsts that pass
		// OP_BFDOT_LN / OP_BFDOT_LNQ as their operation.
		def VDOT_BF : SInst<"vbfdot", "..BB", "fQf">;
		def VDOT_LANE_BF : SOpInst<"vbfdot_lane", "..B(Bq)I", "fQf", OP_BFDOT_LN>;
		def VDOT_LANEQ_BF : SOpInst<"vbfdot_laneq", "..B(BQ)I", "fQf", OP_BFDOT_LNQ> {
		let isLaneQ = 1;
		}

		// vbfmmla is only defined for the "Qf" type.
		def VFMMLA_BF : SInst<"vbfmmla", "..BB", "Qf">;

		// vbfmlalb / vbfmlalt, also only defined for "Qf".
		def VFMLALB_BF : SInst<"vbfmlalb", "..BB", "Qf">;
		def VFMLALT_BF : SInst<"vbfmlalt", "..BB", "Qf">;

		// The _lane and _laneq forms share one operation each; they differ only in
		// the prototype of the lane operand ("(Bq)" versus "(BQ)").
		def VFMLALB_LANE_BF : SOpInst<"vbfmlalb_lane", "..B(Bq)I", "Qf", OP_BFMLALB_LN>;
		def VFMLALB_LANEQ_BF : SOpInst<"vbfmlalb_laneq", "..B(BQ)I", "Qf", OP_BFMLALB_LN>;

		def VFMLALT_LANE_BF : SOpInst<"vbfmlalt_lane", "..B(Bq)I", "Qf", OP_BFMLALT_LN>;
		def VFMLALT_LANEQ_BF : SOpInst<"vbfmlalt_laneq", "..B(BQ)I", "Qf", OP_BFMLALT_LN>;
		}

// v8.3-A Vector complex addition intrinsics		// v8.3-A Vector complex addition intrinsics
let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in {		let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in {
def VCADD_ROT90_FP16 : SInst<"vcadd_rot90", "...", "h">;		def VCADD_ROT90_FP16 : SInst<"vcadd_rot90", "...", "h">;
def VCADD_ROT270_FP16 : SInst<"vcadd_rot270", "...", "h">;		def VCADD_ROT270_FP16 : SInst<"vcadd_rot270", "...", "h">;
def VCADDQ_ROT90_FP16 : SInst<"vcaddq_rot90", "QQQ", "h">;		def VCADDQ_ROT90_FP16 : SInst<"vcaddq_rot90", "QQQ", "h">;
def VCADDQ_ROT270_FP16 : SInst<"vcaddq_rot270", "QQQ", "h">;		def VCADDQ_ROT270_FP16 : SInst<"vcaddq_rot270", "QQQ", "h">;
}		}
let ArchGuard = "defined(__ARM_FEATURE_COMPLEX)" in {		let ArchGuard = "defined(__ARM_FEATURE_COMPLEX)" in {
▲ Show 20 Lines • Show All 92 Lines • Show Last 20 Lines

clang/lib/CodeGen/CGBuiltin.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 4,964 Lines • ▼ Show 20 Lines	static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
NEONMAP0(splatq_laneq_v),		NEONMAP0(splatq_laneq_v),
NEONMAP1(vabs_v, aarch64_neon_abs, 0),		NEONMAP1(vabs_v, aarch64_neon_abs, 0),
NEONMAP1(vabsq_v, aarch64_neon_abs, 0),		NEONMAP1(vabsq_v, aarch64_neon_abs, 0),
NEONMAP0(vaddhn_v),		NEONMAP0(vaddhn_v),
NEONMAP1(vaesdq_v, aarch64_crypto_aesd, 0),		NEONMAP1(vaesdq_v, aarch64_crypto_aesd, 0),
NEONMAP1(vaeseq_v, aarch64_crypto_aese, 0),		NEONMAP1(vaeseq_v, aarch64_crypto_aese, 0),
NEONMAP1(vaesimcq_v, aarch64_crypto_aesimc, 0),		NEONMAP1(vaesimcq_v, aarch64_crypto_aesimc, 0),
NEONMAP1(vaesmcq_v, aarch64_crypto_aesmc, 0),		NEONMAP1(vaesmcq_v, aarch64_crypto_aesmc, 0),
		NEONMAP1(vbfdot_v, aarch64_neon_bfdot, 0),
		NEONMAP1(vbfdotq_v, aarch64_neon_bfdot, 0),
		NEONMAP1(vbfmlalbq_v, aarch64_neon_bfmlalb, 0),
		NEONMAP1(vbfmlaltq_v, aarch64_neon_bfmlalt, 0),
		NEONMAP1(vbfmmlaq_v, aarch64_neon_bfmmla, 0),
NEONMAP1(vcadd_rot270_v, aarch64_neon_vcadd_rot270, Add1ArgType),		NEONMAP1(vcadd_rot270_v, aarch64_neon_vcadd_rot270, Add1ArgType),
NEONMAP1(vcadd_rot90_v, aarch64_neon_vcadd_rot90, Add1ArgType),		NEONMAP1(vcadd_rot90_v, aarch64_neon_vcadd_rot90, Add1ArgType),
NEONMAP1(vcaddq_rot270_v, aarch64_neon_vcadd_rot270, Add1ArgType),		NEONMAP1(vcaddq_rot270_v, aarch64_neon_vcadd_rot270, Add1ArgType),
NEONMAP1(vcaddq_rot90_v, aarch64_neon_vcadd_rot90, Add1ArgType),		NEONMAP1(vcaddq_rot90_v, aarch64_neon_vcadd_rot90, Add1ArgType),
NEONMAP1(vcage_v, aarch64_neon_facge, 0),		NEONMAP1(vcage_v, aarch64_neon_facge, 0),
NEONMAP1(vcageq_v, aarch64_neon_facge, 0),		NEONMAP1(vcageq_v, aarch64_neon_facge, 0),
NEONMAP1(vcagt_v, aarch64_neon_facgt, 0),		NEONMAP1(vcagt_v, aarch64_neon_facgt, 0),
NEONMAP1(vcagtq_v, aarch64_neon_facgt, 0),		NEONMAP1(vcagtq_v, aarch64_neon_facgt, 0),
▲ Show 20 Lines • Show All 1,155 Lines • ▼ Show 20 Lines	Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
}		}
case NEON::BI__builtin_neon_vusdot_v:		case NEON::BI__builtin_neon_vusdot_v:
case NEON::BI__builtin_neon_vusdotq_v: {		case NEON::BI__builtin_neon_vusdotq_v: {
auto *InputTy =		auto *InputTy =
llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);		llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
llvm::Type *Tys[2] = { Ty, InputTy };		llvm::Type *Tys[2] = { Ty, InputTy };
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusdot");		return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusdot");
}		}
		// BFloat16 builtins (vbfdot, vbfmmla, vbfmlalb, vbfmlalt). Each intrinsic is
		// looked up with two overload types: the result type Ty and an i8 vector
		// with the same total bit width as Ty, which is the type the bf16 operands
		// are passed as. FixedVectorType::get is used so these cases build the input
		// type the same way as the neighbouring vusdot case.
		case NEON::BI__builtin_neon_vbfdot_v:
		case NEON::BI__builtin_neon_vbfdotq_v: {
		llvm::Type *InputTy =
		llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
		llvm::Type *Tys[2] = { Ty, InputTy };
		return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfdot");
		}
		case NEON::BI__builtin_neon_vbfmmlaq_v: {
		llvm::Type *InputTy =
		llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
		llvm::Type *Tys[2] = { Ty, InputTy };
		return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmmla");
		}
		case NEON::BI__builtin_neon_vbfmlalbq_v: {
		llvm::Type *InputTy =
		llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
		llvm::Type *Tys[2] = { Ty, InputTy };
		return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalb");
		}
		case NEON::BI__builtin_neon_vbfmlaltq_v: {
		llvm::Type *InputTy =
		llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
		llvm::Type *Tys[2] = { Ty, InputTy };
		return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalt");
		}

}		}

assert(Int && "Expected valid intrinsic number");		assert(Int && "Expected valid intrinsic number");

// Determine the type(s) of this overloaded AArch64 intrinsic.		// Determine the type(s) of this overloaded AArch64 intrinsic.
Function *F = LookupNeonLLVMIntrinsic(Int, Modifier, Ty, E);		Function *F = LookupNeonLLVMIntrinsic(Int, Modifier, Ty, E);

Value *Result = EmitNeonCall(F, Ops, NameHint);		Value *Result = EmitNeonCall(F, Ops, NameHint);
▲ Show 20 Lines • Show All 10,074 Lines • ▼ Show 20 Lines	case WebAssembly::BI__builtin_wasm_qfms_f64x2:
IntNo = Intrinsic::wasm_qfms;		IntNo = Intrinsic::wasm_qfms;
break;		break;
default:		default:
llvm_unreachable("unexpected builtin ID");		llvm_unreachable("unexpected builtin ID");
}		}
Function *Callee = CGM.getIntrinsic(IntNo, A->getType());		Function *Callee = CGM.getIntrinsic(IntNo, A->getType());
return Builder.CreateCall(Callee, {A, B, C});		return Builder.CreateCall(Callee, {A, B, C});
}		}
case WebAssembly::BI__builtin_wasm_narrow_s_i8x16_i16x8:		case WebAssembly::BI__builtin_wasm_narrow_s_i8x16_i16x8:
		miyukiUnsubmitted Not Done Reply Inline Actions This chunk does not belong to the patch miyuki: This chunk does not belong to the patch
		miyukiUnsubmitted Not Done Reply Inline Actions Oops, ignore the previous comment, please. miyuki: Oops, ignore the previous comment, please.
case WebAssembly::BI__builtin_wasm_narrow_u_i8x16_i16x8:		case WebAssembly::BI__builtin_wasm_narrow_u_i8x16_i16x8:
case WebAssembly::BI__builtin_wasm_narrow_s_i16x8_i32x4:		case WebAssembly::BI__builtin_wasm_narrow_s_i16x8_i32x4:
case WebAssembly::BI__builtin_wasm_narrow_u_i16x8_i32x4: {		case WebAssembly::BI__builtin_wasm_narrow_u_i16x8_i32x4: {
Value *Low = EmitScalarExpr(E->getArg(0));		Value *Low = EmitScalarExpr(E->getArg(0));
Value *High = EmitScalarExpr(E->getArg(1));		Value *High = EmitScalarExpr(E->getArg(1));
unsigned IntNo;		unsigned IntNo;
switch (BuiltinID) {		switch (BuiltinID) {
case WebAssembly::BI__builtin_wasm_narrow_s_i8x16_i16x8:		case WebAssembly::BI__builtin_wasm_narrow_s_i8x16_i16x8:
▲ Show 20 Lines • Show All 310 Lines • Show Last 20 Lines

clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c

This file was added.

				// RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \
				// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg -instcombine | FileCheck %s

				miyukiUnsubmitted Done Reply Inline Actions Is it possible to avoid running the whole -O2 pipeline and instead run, say, %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \ -disable-O0-optnone -emit-llvm %s -o - \| opt -S -mem2reg -instcombine \| FileCheck %s Also, I suggest auto-generating the checks using `llvm/utils/update_cc_test_checks.py`. Sorry, I should have mentioned it in the previous review iteration. miyuki: Is it possible to avoid running the whole -O2 pipeline and instead run, say, ``` %clang_cc1…
				#include <arm_neon.h>

				// Every directive below carries its trailing colon; FileCheck ignores a
				// directive written without one, which made these checks vacuous.
				// NOTE(review): 'tail' is optional in the call match because the expected IR
				// looks captured from an optimised run; regenerate with
				// update_cc_test_checks.py to confirm against the RUN line of this file.
				// CHECK-LABEL: test_vbfdot_f32
				// CHECK-NEXT: entry:
				// CHECK-NEXT: %0 = bitcast <4 x bfloat> %a to <8 x i8>
				// CHECK-NEXT: %1 = bitcast <4 x bfloat> %b to <8 x i8>
				miyukiUnsubmitted Done Reply Inline Actions Why not `CHECK-NEXT`? miyuki: Why not `CHECK-NEXT`?
				// CHECK-NEXT: %vbfdot1.i = {{(tail )?}}call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
				// CHECK-NEXT: ret <2 x float> %vbfdot1.i
				miyukiUnsubmitted Done Reply Inline Actions `CHECK-NEXT:` miyuki: `CHECK-NEXT:`
				float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
				return vbfdot_f32(r, a, b);
				}

				// Every directive below carries its trailing colon; FileCheck ignores a
				// directive written without one, which made these checks vacuous.
				// NOTE(review): 'tail' is optional in the call match because the expected IR
				// looks captured from an optimised run; regenerate with
				// update_cc_test_checks.py to confirm against the RUN line of this file.
				// CHECK-LABEL: test_vbfdotq_f32
				// CHECK-NEXT: entry:
				// CHECK-NEXT: %0 = bitcast <8 x bfloat> %a to <16 x i8>
				// CHECK-NEXT: %1 = bitcast <8 x bfloat> %b to <16 x i8>
				// CHECK-NEXT: %vbfdot1.i = {{(tail )?}}call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				// CHECK-NEXT: ret <4 x float> %vbfdot1.i
				float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
				return vbfdotq_f32(r, a, b);
				}

				// Every directive below carries its trailing colon; FileCheck ignores a
				// directive written without one, which made these checks vacuous.
				// NOTE(review): 'tail' is optional in the call match because the expected IR
				// looks captured from an optimised run; the exact splat/bitcast sequence
				// should be confirmed by regenerating with update_cc_test_checks.py.
				// CHECK-LABEL: test_vbfdot_lane_f32
				// CHECK-NEXT: entry:
				// CHECK-NEXT: %0 = bitcast <4 x bfloat> %b to <2 x float>
				// CHECK-NEXT: %lane = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer
				// CHECK-NEXT: %1 = bitcast <4 x bfloat> %a to <8 x i8>
				// CHECK-NEXT: %2 = bitcast <2 x float> %lane to <8 x i8>
				// CHECK-NEXT: %vbfdot1.i = {{(tail )?}}call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
				// CHECK-NEXT: ret <2 x float> %vbfdot1.i
				float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
				return vbfdot_lane_f32(r, a, b, 0);
				}

				// Every directive below carries its trailing colon; FileCheck ignores a
				// directive written without one, which made these checks vacuous.
				// NOTE(review): 'tail' is optional in the call match because the expected IR
				// looks captured from an optimised run; the exact splat/bitcast sequence
				// should be confirmed by regenerating with update_cc_test_checks.py.
				// CHECK-LABEL: test_vbfdotq_laneq_f32
				// CHECK-NEXT: entry:
				// CHECK-NEXT: %0 = bitcast <8 x bfloat> %b to <4 x float>
				// CHECK-NEXT: %lane = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
				// CHECK-NEXT: %1 = bitcast <8 x bfloat> %a to <16 x i8>
				// CHECK-NEXT: %2 = bitcast <4 x float> %lane to <16 x i8>
				// CHECK-NEXT: %vbfdot1.i = {{(tail )?}}call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
				// CHECK-NEXT: ret <4 x float> %vbfdot1.i
				float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
				return vbfdotq_laneq_f32(r, a, b, 3);
				}

				// Every directive below carries its trailing colon; FileCheck ignores a
				// directive written without one, which made these checks vacuous.
				// NOTE(review): 'tail' is optional in the call match because the expected IR
				// looks captured from an optimised run; the exact splat/bitcast sequence
				// should be confirmed by regenerating with update_cc_test_checks.py.
				// CHECK-LABEL: test_vbfdot_laneq_f32
				// CHECK-NEXT: entry:
				// CHECK-NEXT: %0 = bitcast <8 x bfloat> %b to <4 x float>
				// CHECK-NEXT: %lane = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
				// CHECK-NEXT: %1 = bitcast <4 x bfloat> %a to <8 x i8>
				// CHECK-NEXT: %2 = bitcast <2 x float> %lane to <8 x i8>
				// CHECK-NEXT: %vbfdot1.i = {{(tail )?}}call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
				// CHECK-NEXT: ret <2 x float> %vbfdot1.i
				float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
				return vbfdot_laneq_f32(r, a, b, 3);
				}

				// Every directive below carries its trailing colon; FileCheck ignores a
				// directive written without one, which made these checks vacuous.
				// NOTE(review): 'tail' is optional in the call match because the expected IR
				// looks captured from an optimised run; the exact splat/bitcast sequence
				// should be confirmed by regenerating with update_cc_test_checks.py.
				// CHECK-LABEL: test_vbfdotq_lane_f32
				// CHECK-NEXT: entry:
				// CHECK-NEXT: %0 = bitcast <4 x bfloat> %b to <2 x float>
				// CHECK-NEXT: %lane = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
				// CHECK-NEXT: %1 = bitcast <8 x bfloat> %a to <16 x i8>
				// CHECK-NEXT: %2 = bitcast <4 x float> %lane to <16 x i8>
				// CHECK-NEXT: %vbfdot1.i = {{(tail )?}}call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
				// CHECK-NEXT: ret <4 x float> %vbfdot1.i
				float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
				return vbfdotq_lane_f32(r, a, b, 0);
				}

				// Every directive below carries its trailing colon; FileCheck ignores a
				// directive written without one, which made these checks vacuous.
				// NOTE(review): 'tail' is optional in the call match because the expected IR
				// looks captured from an optimised run; regenerate with
				// update_cc_test_checks.py to confirm against the RUN line of this file.
				// CHECK-LABEL: test_vbfmmlaq_f32
				// CHECK-NEXT: entry:
				// CHECK-NEXT: %0 = bitcast <8 x bfloat> %a to <16 x i8>
				// CHECK-NEXT: %1 = bitcast <8 x bfloat> %b to <16 x i8>
				// CHECK-NEXT: %vbfmmla1.i = {{(tail )?}}call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				// CHECK-NEXT: ret <4 x float> %vbfmmla1.i
				float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
				return vbfmmlaq_f32(r, a, b);
				}

				// Every directive below carries its trailing colon; FileCheck ignores a
				// directive written without one, which made these checks vacuous.
				// NOTE(review): 'tail' is optional in the call match because the expected IR
				// looks captured from an optimised run; regenerate with
				// update_cc_test_checks.py to confirm against the RUN line of this file.
				// CHECK-LABEL: test_vbfmlalbq_f32
				// CHECK-NEXT: entry:
				// CHECK-NEXT: %0 = bitcast <8 x bfloat> %a to <16 x i8>
				// CHECK-NEXT: %1 = bitcast <8 x bfloat> %b to <16 x i8>
				// CHECK-NEXT: %vbfmlalb1.i = {{(tail )?}}call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				// CHECK-NEXT: ret <4 x float> %vbfmlalb1.i
				float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
				return vbfmlalbq_f32(r, a, b);
				}

				// Every directive below carries its trailing colon; FileCheck ignores a
				// directive written without one, which made these checks vacuous.
				// NOTE(review): 'tail' is optional in the call match because the expected IR
				// looks captured from an optimised run; regenerate with
				// update_cc_test_checks.py to confirm against the RUN line of this file.
				// CHECK-LABEL: test_vbfmlaltq_f32
				// CHECK-NEXT: entry:
				// CHECK-NEXT: %0 = bitcast <8 x bfloat> %a to <16 x i8>
				// CHECK-NEXT: %1 = bitcast <8 x bfloat> %b to <16 x i8>
				// CHECK-NEXT: %vbfmlalt1.i = {{(tail )?}}call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				// CHECK-NEXT: ret <4 x float> %vbfmlalt1.i
				float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
				return vbfmlaltq_f32(r, a, b);
				}

				// Every directive below carries its trailing colon; FileCheck ignores a
				// directive written without one, which made these checks vacuous.
				// NOTE(review): 'tail' is optional in the call match because the expected IR
				// looks captured from an optimised run; the exact shuffle sequence should be
				// confirmed by regenerating with update_cc_test_checks.py.
				// CHECK-LABEL: test_vbfmlalbq_lane_f32
				// CHECK-NEXT: entry:
				// CHECK-NEXT: %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
				// CHECK-NEXT: %0 = bitcast <8 x bfloat> %a to <16 x i8>
				// CHECK-NEXT: %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
				// CHECK-NEXT: %vbfmlalb1.i = {{(tail )?}}call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				// CHECK-NEXT: ret <4 x float> %vbfmlalb1.i
				float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
				return vbfmlalbq_lane_f32(r, a, b, 0);
				}

				// Every directive below carries its trailing colon; FileCheck ignores a
				// directive written without one, which made these checks vacuous.
				// NOTE(review): 'tail' is optional in the call match because the expected IR
				// looks captured from an optimised run; the exact shuffle sequence should be
				// confirmed by regenerating with update_cc_test_checks.py.
				// CHECK-LABEL: test_vbfmlalbq_laneq_f32
				// CHECK-NEXT: entry:
				// CHECK-NEXT: %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
				// CHECK-NEXT: %0 = bitcast <8 x bfloat> %a to <16 x i8>
				// CHECK-NEXT: %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
				// CHECK-NEXT: %vbfmlalb1.i = {{(tail )?}}call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				// CHECK-NEXT: ret <4 x float> %vbfmlalb1.i
				float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
				return vbfmlalbq_laneq_f32(r, a, b, 3);
				}

				// Every directive below carries its trailing colon; FileCheck ignores a
				// directive written without one, which made these checks vacuous.
				// NOTE(review): 'tail' is optional in the call match because the expected IR
				// looks captured from an optimised run; the exact shuffle sequence should be
				// confirmed by regenerating with update_cc_test_checks.py.
				// CHECK-LABEL: test_vbfmlaltq_lane_f32
				// CHECK-NEXT: entry:
				// CHECK-NEXT: %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
				// CHECK-NEXT: %0 = bitcast <8 x bfloat> %a to <16 x i8>
				// CHECK-NEXT: %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
				// CHECK-NEXT: %vbfmlalt1.i = {{(tail )?}}call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				// CHECK-NEXT: ret <4 x float> %vbfmlalt1.i
				float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
				return vbfmlaltq_lane_f32(r, a, b, 0);
				}

				// Every directive below carries its trailing colon; FileCheck ignores a
				// directive written without one, which made these checks vacuous.
				// NOTE(review): 'tail' is optional in the call match because the expected IR
				// looks captured from an optimised run; the exact shuffle sequence should be
				// confirmed by regenerating with update_cc_test_checks.py.
				// CHECK-LABEL: test_vbfmlaltq_laneq_f32
				// CHECK-NEXT: entry:
				// CHECK-NEXT: %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
				// CHECK-NEXT: %0 = bitcast <8 x bfloat> %a to <16 x i8>
				// CHECK-NEXT: %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
				// CHECK-NEXT: %vbfmlalt1.i = {{(tail )?}}call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				// CHECK-NEXT: ret <4 x float> %vbfmlalt1.i
				float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
				return vbfmlaltq_laneq_f32(r, a, b, 3);
				}

llvm/include/llvm/IR/IntrinsicsAArch64.td

Show First 20 Lines • Show All 172 Lines • ▼ Show 20 Lines	class AdvSIMD_FP16FML_Intrinsic
: Intrinsic<[llvm_anyvector_ty],		: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],		[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
[IntrNoMem]>;		[IntrNoMem]>;

class AdvSIMD_MatMul_Intrinsic		class AdvSIMD_MatMul_Intrinsic
: Intrinsic<[llvm_anyvector_ty],		: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],		[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
[IntrNoMem]>;		[IntrNoMem]>;

		// Intrinsic overloaded on two vector types: the result type, which the first
		// operand must match, and a second vector type shared by the remaining two
		// operands. Marked IntrNoMem. Used by the bfmlalb / bfmlalt intrinsics.
		class AdvSIMD_FML_Intrinsic
		: Intrinsic<[llvm_anyvector_ty],
		[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
		[IntrNoMem]>;

}		}

// Arithmetic ops		// Arithmetic ops

let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {		let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
// Vector Add Across Lanes		// Vector Add Across Lanes
def int_aarch64_neon_saddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic;		def int_aarch64_neon_saddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic;
def int_aarch64_neon_uaddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic;		def int_aarch64_neon_uaddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic;
▲ Show 20 Lines • Show All 265 Lines • ▼ Show 20 Lines	let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
def int_aarch64_neon_udot : AdvSIMD_Dot_Intrinsic;		def int_aarch64_neon_udot : AdvSIMD_Dot_Intrinsic;
def int_aarch64_neon_sdot : AdvSIMD_Dot_Intrinsic;		def int_aarch64_neon_sdot : AdvSIMD_Dot_Intrinsic;

// v8.6-A Matrix Multiply Intrinsics		// v8.6-A Matrix Multiply Intrinsics
def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;		def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;		def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;		def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;		def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
		def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
		def int_aarch64_neon_bfmmla : AdvSIMD_MatMul_Intrinsic;
		def int_aarch64_neon_bfmlalb : AdvSIMD_FML_Intrinsic;
		def int_aarch64_neon_bfmlalt : AdvSIMD_FML_Intrinsic;


// v8.2-A FP16 Fused Multiply-Add Long		// v8.2-A FP16 Fused Multiply-Add Long
def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;		def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;
def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;		def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;
def int_aarch64_neon_fmlal2 : AdvSIMD_FP16FML_Intrinsic;		def int_aarch64_neon_fmlal2 : AdvSIMD_FP16FML_Intrinsic;
def int_aarch64_neon_fmlsl2 : AdvSIMD_FP16FML_Intrinsic;		def int_aarch64_neon_fmlsl2 : AdvSIMD_FP16FML_Intrinsic;

// v8.3-A Floating-point complex add		// v8.3-A Floating-point complex add
▲ Show 20 Lines • Show All 1,879 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64InstrFormats.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 7,809 Lines • ▼ Show 20 Lines
	//----------------------------------------------------------------------------			//----------------------------------------------------------------------------
	// Armv8.6 BFloat16 Extension			// Armv8.6 BFloat16 Extension
	//----------------------------------------------------------------------------			//----------------------------------------------------------------------------
	let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in {			let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in {

	class BaseSIMDThreeSameVectorBFDot<bit Q, bit U, string asm, string kind1,			class BaseSIMDThreeSameVectorBFDot<bit Q, bit U, string asm, string kind1,
	string kind2, RegisterOperand RegType,			string kind2, RegisterOperand RegType,
	ValueType AccumType, ValueType InputType>			ValueType AccumType, ValueType InputType>
	: BaseSIMDThreeSameVectorTied<Q, U, 0b010, 0b11111, RegType, asm, kind1, []> {			: BaseSIMDThreeSameVectorTied<Q, U, 0b010, 0b11111, RegType, asm, kind1, [(set (AccumType RegType:$dst),
				(int_aarch64_neon_bfdot (AccumType RegType:$Rd),
				(InputType RegType:$Rn),
				(InputType RegType:$Rm)))]> {
	let AsmString = !strconcat(asm,			let AsmString = !strconcat(asm,
	"{\t$Rd" # kind1 # ", $Rn" # kind2 #			"{\t$Rd" # kind1 # ", $Rn" # kind2 #
	", $Rm" # kind2 # "}");			", $Rm" # kind2 # "}");
	}			}

	multiclass SIMDThreeSameVectorBFDot<bit U, string asm> {			multiclass SIMDThreeSameVectorBFDot<bit U, string asm> {
	def v4f16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64,			def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64,
	v2f32, v8i8>;			v2f32, v8i8>;
	def v8f16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128,			def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128,
	v4f32, v16i8>;			v4f32, v16i8>;
	}			}

	class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,			class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
	string dst_kind, string lhs_kind,			string dst_kind, string lhs_kind,
	string rhs_kind,			string rhs_kind,
	RegisterOperand RegType,			RegisterOperand RegType,
	ValueType AccumType,			ValueType AccumType,
	ValueType InputType>			ValueType InputType>
	: BaseSIMDIndexedTied<Q, U, 0b0, 0b01, 0b1111,			: BaseSIMDIndexedTied<Q, U, 0b0, 0b01, 0b1111,
	RegType, RegType, V128, VectorIndexS,			RegType, RegType, V128, VectorIndexS,
	asm, "", dst_kind, lhs_kind, rhs_kind,			asm, "", dst_kind, lhs_kind, rhs_kind,
	[]> {			[(set (AccumType RegType:$dst),
				(AccumType (int_aarch64_neon_bfdot
				(AccumType RegType:$Rd),
				(InputType RegType:$Rn),
				(InputType (bitconvert (AccumType
				(AArch64duplane32 (v4f32 V128:$Rm),
				VectorIndexH:$idx)))))))]> {

	bits<2> idx;			bits<2> idx;
	let Inst{21} = idx{0}; // L			let Inst{21} = idx{0}; // L
	let Inst{11} = idx{1}; // H			let Inst{11} = idx{1}; // H
	}			}

	multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> {			multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> {

	def v4f16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h",			def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h",
	".2h", V64, v2f32, v8i8>;			".2h", V64, v2f32, v8i8>;
	def v8f16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h",			def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h",
	".2h", V128, v4f32, v16i8>;			".2h", V128, v4f32, v16i8>;
	}			}

	class SIMDBF16MLAL<bit Q, string asm>			class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode>
	: BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s",			: BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s",
	[]> { // TODO: Add intrinsics			[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
				(v16i8 V128:$Rn),
				(v16i8 V128:$Rm)))]> {
	let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}");			let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}");
	}			}

	class SIMDBF16MLALIndex<bit Q, string asm>			class SIMDBF16MLALIndex<bit Q, string asm, SDPatternOperator OpNode>
	: I<(outs V128:$dst),			: I<(outs V128:$dst),
	(ins V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx), asm,			(ins V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx), asm,
	"{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst",			"{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst",
	[]>, // TODO: Add intrinsics			[(set (v4f32 V128:$dst),
				(v4f32 (OpNode (v4f32 V128:$Rd),
				(v16i8 V128:$Rn),
				(v16i8 (bitconvert (v8bf16
				(AArch64duplane16 (v8bf16 V128_lo:$Rm),
				VectorIndexH:$idx)))))))]>,
	Sched<[WriteV]> {			Sched<[WriteV]> {
	bits<5> Rd;			bits<5> Rd;
	bits<5> Rn;			bits<5> Rn;
	bits<4> Rm;			bits<4> Rm;
	bits<3> idx;			bits<3> idx;

	let Inst{31} = 0;			let Inst{31} = 0;
	let Inst{30} = Q;			let Inst{30} = Q;
	let Inst{29-22} = 0b00111111;			let Inst{29-22} = 0b00111111;
	let Inst{21-20} = idx{1-0};			let Inst{21-20} = idx{1-0};
	let Inst{19-16} = Rm;			let Inst{19-16} = Rm;
	let Inst{15-12} = 0b1111;			let Inst{15-12} = 0b1111;
	let Inst{11} = idx{2}; // H			let Inst{11} = idx{2}; // H
	let Inst{10} = 0;			let Inst{10} = 0;
	let Inst{9-5} = Rn;			let Inst{9-5} = Rn;
	let Inst{4-0} = Rd;			let Inst{4-0} = Rd;
	}			}

	class SIMDThreeSameVectorBF16MatrixMul<string asm>			class SIMDThreeSameVectorBF16MatrixMul<string asm>
	: BaseSIMDThreeSameVectorTied<1, 1, 0b010, 0b11101,			: BaseSIMDThreeSameVectorTied<1, 1, 0b010, 0b11101,
	V128, asm, ".4s",			V128, asm, ".4s",
	[]> {			[(set (v4f32 V128:$dst),
				(int_aarch64_neon_bfmmla (v4f32 V128:$Rd),
				(v16i8 V128:$Rn),
				(v16i8 V128:$Rm)))]> {
	let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h",			let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h",
	", $Rm", ".8h", "}");			", $Rm", ".8h", "}");
	}			}

	class SIMD_BFCVTN			class SIMD_BFCVTN
	: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,			: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
	"bfcvtn", ".4h", ".4s",			"bfcvtn", ".4h", ".4s",
	[]>;			[]>;
	▲ Show 20 Lines • Show All 3,314 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64InstrInfo.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 778 Lines • ▼ Show 20 Lines
	defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", int_aarch64_neon_udot>;			defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", int_aarch64_neon_udot>;
	}			}

	// ARMv8.6-A BFloat			// ARMv8.6-A BFloat
	let Predicates = [HasBF16] in {			let Predicates = [HasBF16] in {
	defm BFDOT : SIMDThreeSameVectorBFDot<1, "bfdot">;			defm BFDOT : SIMDThreeSameVectorBFDot<1, "bfdot">;
	defm BF16DOTlane : SIMDThreeSameVectorBF16DotI<0, "bfdot">;			defm BF16DOTlane : SIMDThreeSameVectorBF16DotI<0, "bfdot">;
	def BFMMLA : SIMDThreeSameVectorBF16MatrixMul<"bfmmla">;			def BFMMLA : SIMDThreeSameVectorBF16MatrixMul<"bfmmla">;
	def BFMLALB : SIMDBF16MLAL<0, "bfmlalb">;			def BFMLALB : SIMDBF16MLAL<0, "bfmlalb", int_aarch64_neon_bfmlalb>;
	def BFMLALT : SIMDBF16MLAL<1, "bfmlalt">;			def BFMLALT : SIMDBF16MLAL<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
	def BFMLALBIdx : SIMDBF16MLALIndex<0, "bfmlalb">;			def BFMLALBIdx : SIMDBF16MLALIndex<0, "bfmlalb", int_aarch64_neon_bfmlalb>;
	def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt">;			def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
	def BFCVTN : SIMD_BFCVTN;			def BFCVTN : SIMD_BFCVTN;
	def BFCVTN2 : SIMD_BFCVTN2;			def BFCVTN2 : SIMD_BFCVTN2;
	def BFCVT : BF16ToSinglePrecision<"bfcvt">;			def BFCVT : BF16ToSinglePrecision<"bfcvt">;
	}			}

	// ARMv8.6A AArch64 matrix multiplication			// ARMv8.6A AArch64 matrix multiplication
	let Predicates = [HasMatMulInt8] in {			let Predicates = [HasMatMulInt8] in {
	def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>;			def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>;
	▲ Show 20 Lines • Show All 6,860 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -mtriple aarch64-arm-none-eabi -mattr=+bf16 %s -o - | FileCheck %s

				define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
				; CHECK-LABEL: test_vbfdot_f32:
				miyukiUnsubmitted Not Done Reply Inline Actions Would it make sense to check the whole body of the compiled function? miyuki: Would it make sense to check the whole body of the compiled function?
				LukeGeesonAuthorUnsubmitted Done Reply Inline Actions Oops sorry, having all kinds of issues with my commit history here, give me a moment to address this LukeGeeson: Oops sorry, having all kinds of issues with my commit history here, give me a moment to address…
				LukeGeesonAuthorUnsubmitted Done Reply Inline Actions I would say it's not worth testing the whole function here, the only code emitted for each is the instruction mentioned in the `CHECK` and a `ret` surrounded by lots of compiler labels and directives that we don't need to test here LukeGeeson: I would say it's not worth testing the whole function here, the only code emitted for each is…
				miyukiUnsubmitted Not Done Reply Inline Actions I meant, just the code from the first BB label to ret (inclusive), without directives. I suggest using `llvm/utils/update_llc_test_checks.py` to generate the checks. miyuki: I meant, just the code from the first BB label to ret (inclusive), without directives. I…
				LukeGeesonAuthorUnsubmitted Done Reply Inline Actions Hopefully this is everything now, please let me know if there is anything else :) LukeGeeson: Hopefully this is everything now, please let me know if there is anything else :)
				; CHECK: // %bb.0: // %entry
				; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.4h
				; CHECK-NEXT: ret
				entry:
				%0 = bitcast <4 x bfloat> %a to <8 x i8>
				%1 = bitcast <4 x bfloat> %b to <8 x i8>
				%vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
				ret <2 x float> %vbfdot1.i
				}

				; Non-indexed bfdot, 128-bit form: both <8 x bfloat> operands are bitcast to
				; <16 x i8> and passed to the bfdot.v4f32.v16i8 intrinsic with the
				; <4 x float> accumulator %r. The expected body is one bfdot on .4s/.8h
				; registers followed by ret.
				define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
				; CHECK-LABEL: test_vbfdotq_f32:
				; CHECK: // %bb.0: // %entry
				; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.8h
				; CHECK-NEXT: ret
				entry:
				%0 = bitcast <8 x bfloat> %a to <16 x i8>
				%1 = bitcast <8 x bfloat> %b to <16 x i8>
				%vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				ret <4 x float> %vbfdot1.i
				}

				; Indexed bfdot, 64-bit accumulator with a 64-bit lane source: %b is viewed
				; as <2 x float> (one float-sized element per bfloat pair), element 0 is
				; splatted with a zero shuffle mask, and the splat is bitcast back to bytes
				; for the bfdot.v2f32.v8i8 intrinsic. The expected instruction is the
				; by-element form using index [0].
				; NOTE(review): the instruction line below uses the plain (non-NEXT) match,
				; unlike the non-indexed tests; presumably this tolerates an extra line
				; emitted between the block label and the instruction - confirm.
				define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
				; CHECK-LABEL: test_vbfdot_lane_f32:
				; CHECK: // %bb.0: // %entry
				; CHECK: bfdot v0.2s, v1.4h, v2.2h[0]
				; CHECK-NEXT: ret
				entry:
				%0 = bitcast <4 x bfloat> %b to <2 x float>
				%shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer
				%1 = bitcast <4 x bfloat> %a to <8 x i8>
				%2 = bitcast <2 x float> %shuffle to <8 x i8>
				%vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
				ret <2 x float> %vbfdot1.i
				}

				; Indexed bfdot, 128-bit accumulator with a 128-bit lane source: %b is
				; viewed as <4 x float>, element 3 is splatted across all four positions,
				; and the splat is bitcast to <16 x i8> for the bfdot.v4f32.v16i8
				; intrinsic. The expected instruction is the by-element form using
				; index [3].
				define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
				; CHECK-LABEL: test_vbfdotq_laneq_f32:
				; CHECK: // %bb.0: // %entry
				; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.2h[3]
				; CHECK-NEXT: ret
				entry:
				%0 = bitcast <8 x bfloat> %b to <4 x float>
				%shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
				%1 = bitcast <8 x bfloat> %a to <16 x i8>
				%2 = bitcast <4 x float> %shuffle to <16 x i8>
				%vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
				ret <4 x float> %vbfdot1.i
				}

				; Indexed bfdot, 64-bit accumulator with a 128-bit lane source: %b is
				; viewed as <4 x float> and the shuffle narrows it to a <2 x float> splat
				; of element 3, which is then bitcast to <8 x i8> for the
				; bfdot.v2f32.v8i8 intrinsic. The expected instruction is the by-element
				; form using index [3] with .2s/.4h operands.
				define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {
				; CHECK-LABEL: test_vbfdot_laneq_f32:
				; CHECK: // %bb.0: // %entry
				; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[3]
				; CHECK-NEXT: ret
				entry:
				%0 = bitcast <8 x bfloat> %b to <4 x float>
				%shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
				%1 = bitcast <4 x bfloat> %a to <8 x i8>
				%2 = bitcast <2 x float> %shuffle to <8 x i8>
				%vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
				ret <2 x float> %vbfdot1.i
				}

				; Indexed bfdot, 128-bit accumulator with a 64-bit lane source: %b is
				; viewed as <2 x float> and the zero shuffle mask widens it to a
				; <4 x float> splat of element 0, which is bitcast to <16 x i8> for the
				; bfdot.v4f32.v16i8 intrinsic. The expected instruction is the by-element
				; form using index [0] with .4s/.8h operands.
				; NOTE(review): the instruction line below uses the plain (non-NEXT) match,
				; unlike the laneq variants; presumably this tolerates an extra line
				; emitted between the block label and the instruction - confirm.
				define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
				; CHECK-LABEL: test_vbfdotq_lane_f32:
				; CHECK: // %bb.0: // %entry
				; CHECK: bfdot v0.4s, v1.8h, v2.2h[0]
				; CHECK-NEXT: ret
				entry:
				%0 = bitcast <4 x bfloat> %b to <2 x float>
				%shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
				%1 = bitcast <8 x bfloat> %a to <16 x i8>
				%2 = bitcast <4 x float> %shuffle to <16 x i8>
				%vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
				ret <4 x float> %vbfdot1.i
				}

				; bfmmla: both <8 x bfloat> operands are bitcast to <16 x i8> and passed to
				; the bfmmla.v4f32.v16i8 intrinsic with the <4 x float> accumulator %r.
				; The expected body is one bfmmla instruction followed by ret.
				define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
				; CHECK-LABEL: test_vbfmmlaq_f32:
				; CHECK: // %bb.0: // %entry
				; CHECK-NEXT: bfmmla v0.4s, v1.8h, v2.8h
				; CHECK-NEXT: ret
				entry:
				%0 = bitcast <8 x bfloat> %a to <16 x i8>
				%1 = bitcast <8 x bfloat> %b to <16 x i8>
				%vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				ret <4 x float> %vbfmmla1.i
				}

				; Non-indexed bfmlalb: both <8 x bfloat> operands are bitcast to <16 x i8>
				; and passed to the bfmlalb.v4f32.v16i8 intrinsic with the <4 x float>
				; accumulator %r. The expected body is one vector-by-vector bfmlalb
				; instruction followed by ret.
				define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
				; CHECK-LABEL: test_vbfmlalbq_f32:
				; CHECK: // %bb.0: // %entry
				; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.8h
				; CHECK-NEXT: ret
				entry:
				%0 = bitcast <8 x bfloat> %a to <16 x i8>
				%1 = bitcast <8 x bfloat> %b to <16 x i8>
				%vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				ret <4 x float> %vbfmlalb1.i
				}

				; Non-indexed bfmlalt: both <8 x bfloat> operands are bitcast to <16 x i8>
				; and passed to the bfmlalt.v4f32.v16i8 intrinsic with the <4 x float>
				; accumulator %r. The expected body is one vector-by-vector bfmlalt
				; instruction followed by ret.
				define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
				; CHECK-LABEL: test_vbfmlaltq_f32:
				; CHECK: // %bb.0: // %entry
				; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.8h
				; CHECK-NEXT: ret
				entry:
				%0 = bitcast <8 x bfloat> %a to <16 x i8>
				%1 = bitcast <8 x bfloat> %b to <16 x i8>
				%vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				ret <4 x float> %vbfmlalt1.i
				}

				; Indexed bfmlalb with a 64-bit lane source: bfloat element 0 of the
				; <4 x bfloat> %b is splatted into an <8 x bfloat> vector (zero shuffle
				; mask), then both operands are bitcast to <16 x i8> for the
				; bfmlalb.v4f32.v16i8 intrinsic. The expected instruction is the
				; by-element form using h[0].
				; NOTE(review): the instruction line below uses the plain (non-NEXT) match,
				; unlike the laneq variant; presumably this tolerates an extra line
				; emitted between the block label and the instruction - confirm.
				define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
				; CHECK-LABEL: test_vbfmlalbq_lane_f32:
				; CHECK: // %bb.0: // %entry
				; CHECK: bfmlalb v0.4s, v1.8h, v2.h[0]
				; CHECK-NEXT: ret
				entry:
				%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
				%0 = bitcast <8 x bfloat> %a to <16 x i8>
				%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
				%vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				ret <4 x float> %vbfmlalb1.i
				}

				; Indexed bfmlalb with a 128-bit lane source: bfloat element 3 of the
				; <8 x bfloat> %b is splatted across all eight positions, then both
				; operands are bitcast to <16 x i8> for the bfmlalb.v4f32.v16i8 intrinsic.
				; The expected instruction is the by-element form using h[3].
				define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
				; CHECK-LABEL: test_vbfmlalbq_laneq_f32:
				; CHECK: // %bb.0: // %entry
				; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.h[3]
				; CHECK-NEXT: ret
				entry:
				%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
				%0 = bitcast <8 x bfloat> %a to <16 x i8>
				%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
				%vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				ret <4 x float> %vbfmlalb1.i
				}

				; Indexed bfmlalt with a 64-bit lane source: bfloat element 0 of the
				; <4 x bfloat> %b is splatted into an <8 x bfloat> vector (zero shuffle
				; mask), then both operands are bitcast to <16 x i8> for the
				; bfmlalt.v4f32.v16i8 intrinsic. The expected instruction is the
				; by-element form using h[0].
				; NOTE(review): the instruction line below uses the plain (non-NEXT) match,
				; unlike the laneq variant; presumably this tolerates an extra line
				; emitted between the block label and the instruction - confirm.
				define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
				; CHECK-LABEL: test_vbfmlaltq_lane_f32:
				; CHECK: // %bb.0: // %entry
				; CHECK: bfmlalt v0.4s, v1.8h, v2.h[0]
				; CHECK-NEXT: ret
				entry:
				%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
				%0 = bitcast <8 x bfloat> %a to <16 x i8>
				%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
				%vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				ret <4 x float> %vbfmlalt1.i
				}

				; Indexed bfmlalt with a 128-bit lane source: bfloat element 3 of the
				; <8 x bfloat> %b is splatted across all eight positions, then both
				; operands are bitcast to <16 x i8> for the bfmlalt.v4f32.v16i8 intrinsic.
				; The expected instruction is the by-element form using h[3].
				define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
				; CHECK-LABEL: test_vbfmlaltq_laneq_f32:
				; CHECK: // %bb.0: // %entry
				; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.h[3]
				; CHECK-NEXT: ret
				entry:
				%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
				%0 = bitcast <8 x bfloat> %a to <16 x i8>
				%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
				%vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				ret <4 x float> %vbfmlalt1.i
				}

				; Declarations of the BFloat16 NEON intrinsics called by the tests above.
				; Each takes a float accumulator plus two byte vectors and returns a float
				; vector of the accumulator's type; bfdot is declared in both the 64-bit
				; (v2f32/v8i8) and 128-bit (v4f32/v16i8) forms, the others only in the
				; 128-bit form.
				; NOTE(review): attribute group #2 is referenced here but no matching
				; "attributes #2" definition is visible in this chunk - confirm it is
				; either defined elsewhere in the file or intentionally omitted.
				declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) #2
				declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
				declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
				declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
				declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64]: BFloat MatMul Intrinsics&CodeGen
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 271095

clang/include/clang/Basic/arm_neon.td

clang/lib/CodeGen/CGBuiltin.cpp

clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c

llvm/include/llvm/IR/IntrinsicsAArch64.td

llvm/lib/Target/AArch64/AArch64InstrFormats.td

llvm/lib/Target/AArch64/AArch64InstrInfo.td

llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64]: BFloat MatMul Intrinsics&CodeGen (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 271095

clang/include/clang/Basic/arm_neon.td

clang/lib/CodeGen/CGBuiltin.cpp

clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c

llvm/include/llvm/IR/IntrinsicsAArch64.td

llvm/lib/Target/AArch64/AArch64InstrFormats.td

llvm/lib/Target/AArch64/AArch64InstrInfo.td

llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll

[AArch64]: BFloat MatMul Intrinsics&CodeGen
ClosedPublic