Diff 288390

clang/lib/CodeGen/CGBuiltin.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 6,235 Lines • ▼ Show 20 Lines	case NEON::BI__builtin_neon_vusdotq_v: {
auto *InputTy =		auto *InputTy =
llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);		llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
llvm::Type *Tys[2] = { Ty, InputTy };		llvm::Type *Tys[2] = { Ty, InputTy };
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusdot");		return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusdot");
}		}
case NEON::BI__builtin_neon_vbfdot_v:		case NEON::BI__builtin_neon_vbfdot_v:
case NEON::BI__builtin_neon_vbfdotq_v: {		case NEON::BI__builtin_neon_vbfdotq_v: {
llvm::Type *InputTy =		llvm::Type *InputTy =
llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);		llvm::FixedVectorType::get(BFloatTy, Ty->getPrimitiveSizeInBits() / 16);
llvm::Type *Tys[2] = { Ty, InputTy };		llvm::Type *Tys[2] = { Ty, InputTy };
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfdot");		return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfdot");
}		}
case NEON::BI__builtin_neon_vbfmmlaq_v: {
llvm::Type *InputTy =
llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
llvm::Type *Tys[2] = { Ty, InputTy };
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmmla");
}
case NEON::BI__builtin_neon_vbfmlalbq_v: {
llvm::Type *InputTy =
llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
llvm::Type *Tys[2] = { Ty, InputTy };
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalb");
}
case NEON::BI__builtin_neon_vbfmlaltq_v: {
llvm::Type *InputTy =
llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
llvm::Type *Tys[2] = { Ty, InputTy };
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalt");
}
case NEON::BI__builtin_neon___a32_vcvt_bf16_v: {		case NEON::BI__builtin_neon___a32_vcvt_bf16_v: {
llvm::Type *Tys[1] = { Ty };		llvm::Type *Tys[1] = { Ty };
Function *F = CGM.getIntrinsic(Int, Tys);		Function *F = CGM.getIntrinsic(Int, Tys);
return EmitNeonCall(F, Ops, "vcvtfp2bf");		return EmitNeonCall(F, Ops, "vcvtfp2bf");
}		}

}		}

▲ Show 20 Lines • Show All 10,509 Lines • Show Last 20 Lines

clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c

				// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
	// RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \			// RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \
	// RUN: -disable-O0-optnone -emit-llvm %s -o - \| opt -S -mem2reg -instcombine \| FileCheck %s			// RUN: -disable-O0-optnone -emit-llvm %s -o - \| opt -S -mem2reg -instcombine \| FileCheck %s

	#include <arm_neon.h>			#include <arm_neon.h>

	// CHECK-LABEL: test_vbfdot_f32			// CHECK-LABEL: @test_vbfdot_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT %0 = bitcast <4 x bfloat> %a to <8 x i8>			// CHECK-NEXT: [[VBFDOT3_I:%.]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.]], <4 x bfloat> [[A:%.]], <4 x bfloat> [[B:%.]]) [[ATTR3:#.*]]
	// CHECK-NEXT %1 = bitcast <4 x bfloat> %b to <8 x i8>			// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
	// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)			//
	// CHECK-NEXT ret <2 x float> %vbfdot1.i
	float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {			float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
	return vbfdot_f32(r, a, b);			return vbfdot_f32(r, a, b);
	}			}

	// CHECK-LABEL: test_vbfdotq_f32			// CHECK-LABEL: @test_vbfdotq_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>			// CHECK-NEXT: [[VBFDOT3_I:%.]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.]], <8 x bfloat> [[B:%.]]) [[ATTR3]]
	// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8>			// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
	// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)			//
	// CHECK-NEXT ret <4 x float> %vbfdot1.i
	float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){			float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
	return vbfdotq_f32(r, a, b);			return vbfdotq_f32(r, a, b);
	}			}

	// CHECK-LABEL: test_vbfdot_lane_f32			// CHECK-LABEL: @test_vbfdot_lane_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT %0 = bitcast <4 x bfloat> %b to <2 x float>			// CHECK-NEXT: [[DOTCAST:%.]] = bitcast <4 x bfloat> [[B:%.]] to <2 x float>
	// CHECK-NEXT %lane = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer			// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <2 x i32> zeroinitializer
	// CHECK-NEXT %1 = bitcast <4 x bfloat> %a to <8 x i8>			// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
	// CHECK-NEXT %2 = bitcast <2 x float> %lane to <8 x i8>			// CHECK-NEXT: [[VBFDOT3_I:%.]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]]
	// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)			// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
	// CHECK-NEXT ret <2 x float> %vbfdot1.i			//
	float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){			float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
	return vbfdot_lane_f32(r, a, b, 0);			return vbfdot_lane_f32(r, a, b, 0);
	}			}

	// CHECK-LABEL: test_vbfdotq_laneq_f32			// CHECK-LABEL: @test_vbfdotq_laneq_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT %0 = bitcast <8 x bfloat> %b to <4 x float>			// CHECK-NEXT: [[DOTCAST:%.]] = bitcast <8 x bfloat> [[B:%.]] to <4 x float>
	// CHECK-NEXT %lane = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>			// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
	// CHECK-NEXT %1 = bitcast <8 x bfloat> %a to <16 x i8>			// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
	// CHECK-NEXT %2 = bitcast <4 x float> %lane to <16 x i8>			// CHECK-NEXT: [[VBFDOT3_I:%.]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]]
	// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)			// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
	// CHECK-NEXT ret <4 x float> %vbfdot1.i			//
	float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {			float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
	return vbfdotq_laneq_f32(r, a, b, 3);			return vbfdotq_laneq_f32(r, a, b, 3);
	}			}

	// CHECK-LABEL: test_vbfdot_laneq_f32			// CHECK-LABEL: @test_vbfdot_laneq_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT %0 = bitcast <8 x bfloat> %b to <4 x float>			// CHECK-NEXT: [[DOTCAST:%.]] = bitcast <8 x bfloat> [[B:%.]] to <4 x float>
	// CHECK-NEXT %lane = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>			// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <2 x i32> <i32 3, i32 3>
	// CHECK-NEXT %1 = bitcast <4 x bfloat> %a to <8 x i8>			// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
	// CHECK-NEXT %2 = bitcast <2 x float> %lane to <8 x i8>			// CHECK-NEXT: [[VBFDOT3_I:%.]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]]
	// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)			// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
	// CHECK-NEXT ret <2 x float> %vbfdot1.i			//
	float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {			float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
	return vbfdot_laneq_f32(r, a, b, 3);			return vbfdot_laneq_f32(r, a, b, 3);
	}			}

	// CHECK-LABEL: test_vbfdotq_lane_f32			// CHECK-LABEL: @test_vbfdotq_lane_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT %0 = bitcast <4 x bfloat> %b to <2 x float>			// CHECK-NEXT: [[DOTCAST:%.]] = bitcast <4 x bfloat> [[B:%.]] to <2 x float>
	// CHECK-NEXT %lane = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer			// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <4 x i32> zeroinitializer
	// CHECK-NEXT %1 = bitcast <8 x bfloat> %a to <16 x i8>			// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
	// CHECK-NEXT %2 = bitcast <4 x float> %lane to <16 x i8>			// CHECK-NEXT: [[VBFDOT3_I:%.]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]]
	// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)			// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
	// CHECK-NEXT ret <4 x float> %vbfdot1.i			//
	float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {			float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
	return vbfdotq_lane_f32(r, a, b, 0);			return vbfdotq_lane_f32(r, a, b, 0);
	}			}

	// CHECK-LABEL: test_vbfmmlaq_f32			// CHECK-LABEL: @test_vbfmmlaq_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>			// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.]], <8 x bfloat> [[B:%.]]) [[ATTR3]]
	// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8>			// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]]
	// CHECK-NEXT %vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)			//
	// CHECK-NEXT ret <4 x float> %vbfmmla1.i
	float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {			float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
	return vbfmmlaq_f32(r, a, b);			return vbfmmlaq_f32(r, a, b);
	}			}

	// CHECK-LABEL: test_vbfmlalbq_f32			// CHECK-LABEL: @test_vbfmlalbq_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>			// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.]], <8 x bfloat> [[B:%.]]) [[ATTR3]]
	// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8>			// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
	// CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)			//
	// CHECK-NEXT ret <4 x float> %vbfmlalb1.i
	float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {			float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
	return vbfmlalbq_f32(r, a, b);			return vbfmlalbq_f32(r, a, b);
	}			}

	// CHECK-LABEL: test_vbfmlaltq_f32			// CHECK-LABEL: @test_vbfmlaltq_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>			// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.]], <8 x bfloat> [[B:%.]]) [[ATTR3]]
	// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8>			// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
	// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)			//
	// CHECK-NEXT ret <4 x float> %vbfmlalt1.i
	float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {			float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
	return vbfmlaltq_f32(r, a, b);			return vbfmlaltq_f32(r, a, b);
	}			}

	// CHECK-LABEL: test_vbfmlalbq_lane_f32			// CHECK-LABEL: @test_vbfmlalbq_lane_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer			// CHECK-NEXT: [[VECINIT35:%.]] = shufflevector <4 x bfloat> [[B:%.]], <4 x bfloat> undef, <8 x i32> zeroinitializer
	// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>			// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
	// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>			// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
	// CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)			//
	// CHECK-NEXT ret <4 x float> %vbfmlalb1.i
	float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {			float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
	return vbfmlalbq_lane_f32(r, a, b, 0);			return vbfmlalbq_lane_f32(r, a, b, 0);
	}			}

	// CHECK-LABEL: test_vbfmlalbq_laneq_f32			// CHECK-LABEL: @test_vbfmlalbq_laneq_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>			// CHECK-NEXT: [[VECINIT35:%.]] = shufflevector <8 x bfloat> [[B:%.]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
	// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>			// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
	// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>			// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
	// CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)			//
	// CHECK-NEXT ret <4 x float> %vbfmlalb1.i
	float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {			float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
	return vbfmlalbq_laneq_f32(r, a, b, 3);			return vbfmlalbq_laneq_f32(r, a, b, 3);
	}			}

	// CHECK-LABEL: test_vbfmlaltq_lane_f32			// CHECK-LABEL: @test_vbfmlaltq_lane_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer			// CHECK-NEXT: [[VECINIT35:%.]] = shufflevector <4 x bfloat> [[B:%.]], <4 x bfloat> undef, <8 x i32> zeroinitializer
	// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>			// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
	// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>			// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
	// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)			//
	// CHECK-NEXT ret <4 x float> %vbfmlalt1.i
	float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {			float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
	return vbfmlaltq_lane_f32(r, a, b, 0);			return vbfmlaltq_lane_f32(r, a, b, 0);
	}			}

	// CHECK-LABEL: test_vbfmlaltq_laneq_f32			// CHECK-LABEL: @test_vbfmlaltq_laneq_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>			// CHECK-NEXT: [[VECINIT35:%.]] = shufflevector <8 x bfloat> [[B:%.]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
	// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>			// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
	// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>			// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
	// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)			//
	// CHECK-NEXT ret <4 x float> %vbfmlalt1.i
	float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {			float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
	return vbfmlaltq_laneq_f32(r, a, b, 3);			return vbfmlaltq_laneq_f32(r, a, b, 3);
	}			}

clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c

	// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py			// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
	// RUN: %clang_cc1 -triple armv8-arm-none-eabi \			// RUN: %clang_cc1 -triple armv8-arm-none-eabi \
	// RUN: -target-feature +neon -target-feature +bf16 -mfloat-abi soft \			// RUN: -target-feature +neon -target-feature +bf16 -mfloat-abi soft \
	// RUN: -disable-O0-optnone -S -emit-llvm -o - %s \			// RUN: -disable-O0-optnone -S -emit-llvm -o - %s \
	// RUN: \| opt -S -mem2reg -instcombine \| FileCheck %s			// RUN: \| opt -S -mem2reg -instcombine \| FileCheck %s
	// RUN: %clang_cc1 -triple armv8-arm-none-eabi \			// RUN: %clang_cc1 -triple armv8-arm-none-eabi \
	// RUN: -target-feature +neon -target-feature +bf16 -mfloat-abi hard \			// RUN: -target-feature +neon -target-feature +bf16 -mfloat-abi hard \
	// RUN: -disable-O0-optnone -S -emit-llvm -o - %s \			// RUN: -disable-O0-optnone -S -emit-llvm -o - %s \
	// RUN: \| opt -S -mem2reg -instcombine \| FileCheck %s			// RUN: \| opt -S -mem2reg -instcombine \| FileCheck %s

	#include <arm_neon.h>			#include <arm_neon.h>

	// CHECK-LABEL: @test_vbfdot_f32(			// CHECK-LABEL: @test_vbfdot_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT: [[TMP0:%.]] = bitcast <4 x bfloat> [[A:%.]] to <8 x i8>			// CHECK-NEXT: [[VBFDOT3_I:%.]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.]], <4 x bfloat> [[A:%.]], <4 x bfloat> [[B:%.]]) [[ATTR3:#.*]]
	// CHECK-NEXT: [[TMP1:%.]] = bitcast <4 x bfloat> [[B:%.]] to <8 x i8>			// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
	// CHECK-NEXT: [[VBFDOT1_I:%.]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #3
	// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]]
	//			//
	float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {			float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
	return vbfdot_f32(r, a, b);			return vbfdot_f32(r, a, b);
	}			}

	// CHECK-LABEL: @test_vbfdotq_f32(			// CHECK-LABEL: @test_vbfdotq_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT: [[TMP0:%.]] = bitcast <8 x bfloat> [[A:%.]] to <16 x i8>			// CHECK-NEXT: [[VBFDOT3_I:%.]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.]], <8 x bfloat> [[B:%.]]) [[ATTR3]]
	// CHECK-NEXT: [[TMP1:%.]] = bitcast <8 x bfloat> [[B:%.]] to <16 x i8>			// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
	// CHECK-NEXT: [[VBFDOT1_I:%.]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
	// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]]
	//			//
	float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){			float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
	return vbfdotq_f32(r, a, b);			return vbfdotq_f32(r, a, b);
	}			}

	// CHECK-LABEL: @test_vbfdot_lane_f32(			// CHECK-LABEL: @test_vbfdot_lane_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT: [[DOTCAST:%.]] = bitcast <4 x bfloat> [[B:%.]] to <2 x float>			// CHECK-NEXT: [[DOTCAST:%.]] = bitcast <4 x bfloat> [[B:%.]] to <2 x float>
	// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <2 x i32> zeroinitializer			// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <2 x i32> zeroinitializer
	// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>			// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
	// CHECK-NEXT: [[TMP0:%.]] = bitcast <4 x bfloat> [[A:%.]] to <8 x i8>			// CHECK-NEXT: [[VBFDOT3_I:%.]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]]
	// CHECK-NEXT: [[VBFDOT1_I:%.]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) #3			// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
	// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]]
	//			//
	float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){			float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
	return vbfdot_lane_f32(r, a, b, 0);			return vbfdot_lane_f32(r, a, b, 0);
	}			}

	// CHECK-LABEL: @test_vbfdotq_laneq_f32(			// CHECK-LABEL: @test_vbfdotq_laneq_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT: [[DOTCAST:%.]] = bitcast <8 x bfloat> [[B:%.]] to <4 x float>			// CHECK-NEXT: [[DOTCAST:%.]] = bitcast <8 x bfloat> [[B:%.]] to <4 x float>
	// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>			// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
	// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>			// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
	// CHECK-NEXT: [[TMP0:%.]] = bitcast <8 x bfloat> [[A:%.]] to <16 x i8>			// CHECK-NEXT: [[VBFDOT3_I:%.]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]]
	// CHECK-NEXT: [[VBFDOT1_I:%.]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) #3			// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
	// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]]
	//			//
	float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {			float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
	return vbfdotq_laneq_f32(r, a, b, 3);			return vbfdotq_laneq_f32(r, a, b, 3);
	}			}

	// CHECK-LABEL: @test_vbfdot_laneq_f32(			// CHECK-LABEL: @test_vbfdot_laneq_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT: [[DOTCAST:%.]] = bitcast <8 x bfloat> [[B:%.]] to <4 x float>			// CHECK-NEXT: [[DOTCAST:%.]] = bitcast <8 x bfloat> [[B:%.]] to <4 x float>
	// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <2 x i32> <i32 3, i32 3>			// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <2 x i32> <i32 3, i32 3>
	// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>			// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
	// CHECK-NEXT: [[TMP0:%.]] = bitcast <4 x bfloat> [[A:%.]] to <8 x i8>			// CHECK-NEXT: [[VBFDOT3_I:%.]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]]
	// CHECK-NEXT: [[VBFDOT1_I:%.]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) #3			// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
	// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]]
	//			//
	float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {			float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
	return vbfdot_laneq_f32(r, a, b, 3);			return vbfdot_laneq_f32(r, a, b, 3);
	}			}

	// CHECK-LABEL: @test_vbfdotq_lane_f32(			// CHECK-LABEL: @test_vbfdotq_lane_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT: [[DOTCAST:%.]] = bitcast <4 x bfloat> [[B:%.]] to <2 x float>			// CHECK-NEXT: [[DOTCAST:%.]] = bitcast <4 x bfloat> [[B:%.]] to <2 x float>
	// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <4 x i32> zeroinitializer			// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <4 x i32> zeroinitializer
	// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>			// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
	// CHECK-NEXT: [[TMP0:%.]] = bitcast <8 x bfloat> [[A:%.]] to <16 x i8>			// CHECK-NEXT: [[VBFDOT3_I:%.]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]]
	// CHECK-NEXT: [[VBFDOT1_I:%.]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) #3			// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
	// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]]
	//			//
	float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {			float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
	return vbfdotq_lane_f32(r, a, b, 0);			return vbfdotq_lane_f32(r, a, b, 0);
	}			}

	// CHECK-LABEL: @test_vbfmmlaq_f32(			// CHECK-LABEL: @test_vbfmmlaq_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT: [[TMP0:%.]] = bitcast <8 x bfloat> [[A:%.]] to <16 x i8>			// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.]] = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.]], <8 x bfloat> [[B:%.]]) [[ATTR3]]
	// CHECK-NEXT: [[TMP1:%.]] = bitcast <8 x bfloat> [[B:%.]] to <16 x i8>			// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]]
	// CHECK-NEXT: [[VBFMMLA1_I:%.]] = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> [[R:%.]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
	// CHECK-NEXT: ret <4 x float> [[VBFMMLA1_I]]
	//			//
	float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {			float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
	return vbfmmlaq_f32(r, a, b);			return vbfmmlaq_f32(r, a, b);
	}			}

	// CHECK-LABEL: @test_vbfmlalbq_f32(			// CHECK-LABEL: @test_vbfmlalbq_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT: [[TMP0:%.]] = bitcast <8 x bfloat> [[A:%.]] to <16 x i8>			// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.]], <8 x bfloat> [[B:%.]]) [[ATTR3]]
	// CHECK-NEXT: [[TMP1:%.]] = bitcast <8 x bfloat> [[B:%.]] to <16 x i8>			// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
	// CHECK-NEXT: [[VBFMLALB1_I:%.]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
	// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]]
	//			//
	float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {			float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
	return vbfmlalbq_f32(r, a, b);			return vbfmlalbq_f32(r, a, b);
	}			}

	// CHECK-LABEL: @test_vbfmlaltq_f32(			// CHECK-LABEL: @test_vbfmlaltq_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT: [[TMP0:%.]] = bitcast <8 x bfloat> [[A:%.]] to <16 x i8>			// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.]], <8 x bfloat> [[B:%.]]) [[ATTR3]]
	// CHECK-NEXT: [[TMP1:%.]] = bitcast <8 x bfloat> [[B:%.]] to <16 x i8>			// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
	// CHECK-NEXT: [[VBFMLALT1_I:%.]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
	// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]]
	//			//
	float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {			float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
	return vbfmlaltq_f32(r, a, b);			return vbfmlaltq_f32(r, a, b);
	}			}

	// CHECK-LABEL: @test_vbfmlalbq_lane_f32(			// CHECK-LABEL: @test_vbfmlalbq_lane_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT: [[VECINIT35:%.]] = shufflevector <4 x bfloat> [[B:%.]], <4 x bfloat> undef, <8 x i32> zeroinitializer			// CHECK-NEXT: [[VECINIT35:%.]] = shufflevector <4 x bfloat> [[B:%.]], <4 x bfloat> undef, <8 x i32> zeroinitializer
	// CHECK-NEXT: [[TMP0:%.]] = bitcast <8 x bfloat> [[A:%.]] to <16 x i8>			// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
	// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>			// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
	// CHECK-NEXT: [[VBFMLALB1_I:%.]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
	// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]]
	//			//
	float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {			float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
	return vbfmlalbq_lane_f32(r, a, b, 0);			return vbfmlalbq_lane_f32(r, a, b, 0);
	}			}

	// CHECK-LABEL: @test_vbfmlalbq_laneq_f32(			// CHECK-LABEL: @test_vbfmlalbq_laneq_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT: [[VECINIT35:%.]] = shufflevector <8 x bfloat> [[B:%.]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>			// CHECK-NEXT: [[VECINIT35:%.]] = shufflevector <8 x bfloat> [[B:%.]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
	// CHECK-NEXT: [[TMP0:%.]] = bitcast <8 x bfloat> [[A:%.]] to <16 x i8>			// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
	// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>			// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
	// CHECK-NEXT: [[VBFMLALB1_I:%.]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
	// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]]
	//			//
	float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {			float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
	return vbfmlalbq_laneq_f32(r, a, b, 3);			return vbfmlalbq_laneq_f32(r, a, b, 3);
	}			}

	// CHECK-LABEL: @test_vbfmlaltq_lane_f32(			// CHECK-LABEL: @test_vbfmlaltq_lane_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT: [[VECINIT35:%.]] = shufflevector <4 x bfloat> [[B:%.]], <4 x bfloat> undef, <8 x i32> zeroinitializer			// CHECK-NEXT: [[VECINIT35:%.]] = shufflevector <4 x bfloat> [[B:%.]], <4 x bfloat> undef, <8 x i32> zeroinitializer
	// CHECK-NEXT: [[TMP0:%.]] = bitcast <8 x bfloat> [[A:%.]] to <16 x i8>			// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
	// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>			// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
	// CHECK-NEXT: [[VBFMLALT1_I:%.]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
	// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]]
	//			//
	float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {			float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
	return vbfmlaltq_lane_f32(r, a, b, 0);			return vbfmlaltq_lane_f32(r, a, b, 0);
	}			}

	// CHECK-LABEL: @test_vbfmlaltq_laneq_f32(			// CHECK-LABEL: @test_vbfmlaltq_laneq_f32(
	// CHECK-NEXT: entry:			// CHECK-NEXT: entry:
	// CHECK-NEXT: [[VECINIT35:%.]] = shufflevector <8 x bfloat> [[B:%.]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>			// CHECK-NEXT: [[VECINIT35:%.]] = shufflevector <8 x bfloat> [[B:%.]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
	// CHECK-NEXT: [[TMP0:%.]] = bitcast <8 x bfloat> [[A:%.]] to <16 x i8>			// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
	// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>			// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
	// CHECK-NEXT: [[VBFMLALT1_I:%.]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
	// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]]
	//			//
	float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {			float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
	return vbfmlaltq_laneq_f32(r, a, b, 3);			return vbfmlaltq_laneq_f32(r, a, b, 3);
	}			}

llvm/include/llvm/IR/IntrinsicsAArch64.td

Show First 20 Lines • Show All 178 Lines • ▼ Show 20 Lines	: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],		[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
[IntrNoMem]>;		[IntrNoMem]>;

class AdvSIMD_FML_Intrinsic		class AdvSIMD_FML_Intrinsic
: Intrinsic<[llvm_anyvector_ty],		: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],		[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
[IntrNoMem]>;		[IntrNoMem]>;

		class AdvSIMD_BF16FML_Intrinsic
		: Intrinsic<[llvm_v4f32_ty],
		[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
		[IntrNoMem]>;
}		}

// Arithmetic ops		// Arithmetic ops

let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {		let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
// Vector Add Across Lanes		// Vector Add Across Lanes
def int_aarch64_neon_saddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic;		def int_aarch64_neon_saddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic;
def int_aarch64_neon_uaddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic;		def int_aarch64_neon_uaddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic;
▲ Show 20 Lines • Show All 266 Lines • ▼ Show 20 Lines	let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
def int_aarch64_neon_sdot : AdvSIMD_Dot_Intrinsic;		def int_aarch64_neon_sdot : AdvSIMD_Dot_Intrinsic;

// v8.6-A Matrix Multiply Intrinsics		// v8.6-A Matrix Multiply Intrinsics
def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;		def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;		def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;		def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;		def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;		def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
def int_aarch64_neon_bfmmla : AdvSIMD_MatMul_Intrinsic;		def int_aarch64_neon_bfmmla
def int_aarch64_neon_bfmlalb : AdvSIMD_FML_Intrinsic;		: Intrinsic<[llvm_v4f32_ty],
def int_aarch64_neon_bfmlalt : AdvSIMD_FML_Intrinsic;		[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
		[IntrNoMem]>;
		def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic;
		def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic;
		dmgreenUnsubmitted Not Done Reply Inline Actions This can be an AdvSIMD_BF16FML_Intrinsic? dmgreen: This can be an AdvSIMD_BF16FML_Intrinsic?
		miyukiAuthorUnsubmitted Done Reply Inline Actions I want to make a distinction because FML is "Fused multiply-add" and bfmmla is a matrix multiplication intrinsic (even though its prototype matches AdvSIMD_BF16FML_Intrinsic). miyuki: I want to make a distinction because FML is "Fused multiply-add" and bfmmla is a matrix…


// v8.6-A Bfloat Intrinsics		// v8.6-A Bfloat Intrinsics
def int_aarch64_neon_bfcvt		def int_aarch64_neon_bfcvt
: Intrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;		: Intrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;
def int_aarch64_neon_bfcvtn		def int_aarch64_neon_bfcvtn
: Intrinsic<[llvm_v8bf16_ty], [llvm_v4f32_ty], [IntrNoMem]>;		: Intrinsic<[llvm_v8bf16_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_aarch64_neon_bfcvtn2		def int_aarch64_neon_bfcvtn2
▲ Show 20 Lines • Show All 1,916 Lines • Show Last 20 Lines

llvm/include/llvm/IR/IntrinsicsARM.td

	Show First 20 Lines • Show All 785 Lines • ▼ Show 20 Lines

	// v8.6-A Bfloat Intrinsics			// v8.6-A Bfloat Intrinsics
	def int_arm_neon_vcvtfp2bf			def int_arm_neon_vcvtfp2bf
	: Intrinsic<[llvm_anyvector_ty], [llvm_v4f32_ty], [IntrNoMem]>;			: Intrinsic<[llvm_anyvector_ty], [llvm_v4f32_ty], [IntrNoMem]>;
	def int_arm_neon_vcvtbfp2bf			def int_arm_neon_vcvtbfp2bf
	: Intrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;			: Intrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;

	def int_arm_neon_bfdot : Neon_Dot_Intrinsic;			def int_arm_neon_bfdot : Neon_Dot_Intrinsic;
	def int_arm_neon_bfmmla : Neon_MatMul_Intrinsic;			def int_arm_neon_bfmmla
				: Intrinsic<[llvm_v4f32_ty],
				[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
				[IntrNoMem]>;

	class Neon_FML_Intrinsic			class Neon_BF16FML_Intrinsic
	: Intrinsic<[llvm_anyvector_ty],			: Intrinsic<[llvm_v4f32_ty],
	[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],			[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
	[IntrNoMem]>;			[IntrNoMem]>;
	def int_arm_neon_bfmlalb : Neon_FML_Intrinsic;			def int_arm_neon_bfmlalb : Neon_BF16FML_Intrinsic;
	def int_arm_neon_bfmlalt : Neon_FML_Intrinsic;			def int_arm_neon_bfmlalt : Neon_BF16FML_Intrinsic;

	def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;			def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
	def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;			def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;

	def int_arm_mve_vctp8 : Intrinsic<[llvm_v16i1_ty], [llvm_i32_ty], [IntrNoMem]>;			def int_arm_mve_vctp8 : Intrinsic<[llvm_v16i1_ty], [llvm_i32_ty], [IntrNoMem]>;
	def int_arm_mve_vctp16 : Intrinsic<[llvm_v8i1_ty], [llvm_i32_ty], [IntrNoMem]>;			def int_arm_mve_vctp16 : Intrinsic<[llvm_v8i1_ty], [llvm_i32_ty], [IntrNoMem]>;
	def int_arm_mve_vctp32 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>;			def int_arm_mve_vctp32 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>;
	// vctp64 takes v4i1, to work around v2i1 not being a legal MVE type			// vctp64 takes v4i1, to work around v2i1 not being a legal MVE type
	▲ Show 20 Lines • Show All 570 Lines • Show Last 20 Lines

llvm/lib/IR/AutoUpgrade.cpp

Show First 20 Lines • Show All 626 Lines • ▼ Show 20 Lines	if (Name.startswith("aarch64.neon.addp")) {
break; // Invalid IR.		break; // Invalid IR.
VectorType *Ty = dyn_cast<VectorType>(F->getReturnType());		VectorType *Ty = dyn_cast<VectorType>(F->getReturnType());
if (Ty && Ty->getElementType()->isFloatingPointTy()) {		if (Ty && Ty->getElementType()->isFloatingPointTy()) {
NewFn = Intrinsic::getDeclaration(F->getParent(),		NewFn = Intrinsic::getDeclaration(F->getParent(),
Intrinsic::aarch64_neon_faddp, Ty);		Intrinsic::aarch64_neon_faddp, Ty);
return true;		return true;
}		}
}		}

		// Changed in 12.0: bfdot accepts v4bf16 and v8bf16 instead of v8i8 and
		// v16i8, respectively.
		if ((Name.startswith("arm.neon.bfdot.") ||
		Name.startswith("aarch64.neon.bfdot.")) &&
		Name.endswith("i8")) {
		Intrinsic::ID IID =
		StringSwitch<Intrinsic::ID>(Name)
		.Cases("arm.neon.bfdot.v2f32.v8i8",
		"arm.neon.bfdot.v4f32.v16i8",
		Intrinsic::arm_neon_bfdot)
		.Cases("aarch64.neon.bfdot.v2f32.v8i8",
		"aarch64.neon.bfdot.v4f32.v16i8",
		Intrinsic::aarch64_neon_bfdot)
		.Default(Intrinsic::not_intrinsic);
		if (IID == Intrinsic::not_intrinsic)
		break;

		size_t OperandWidth = F->getReturnType()->getPrimitiveSizeInBits();
		assert((OperandWidth == 64 || OperandWidth == 128) &&
		"Unexpected operand width");
		LLVMContext &Ctx = F->getParent()->getContext();
		std::array<Type *, 2> Tys {{
		F->getReturnType(),
		FixedVectorType::get(Type::getBFloatTy(Ctx), OperandWidth / 16)
		}};
		NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys);
		return true;
		}

		// Changed in 12.0: bfmmla, bfmlalb and bfmlalt are not polymorphic anymore
		// and accept v8bf16 instead of v16i8
		if ((Name.startswith("arm.neon.bfm") ||
		Name.startswith("aarch64.neon.bfm")) &&
		Name.endswith(".v4f32.v16i8")) {
		Intrinsic::ID IID =
		StringSwitch<Intrinsic::ID>(Name)
		.Case("arm.neon.bfmmla.v4f32.v16i8",
		Intrinsic::arm_neon_bfmmla)
		.Case("arm.neon.bfmlalb.v4f32.v16i8",
		Intrinsic::arm_neon_bfmlalb)
		.Case("arm.neon.bfmlalt.v4f32.v16i8",
		Intrinsic::arm_neon_bfmlalt)
		.Case("aarch64.neon.bfmmla.v4f32.v16i8",
		Intrinsic::aarch64_neon_bfmmla)
		.Case("aarch64.neon.bfmlalb.v4f32.v16i8",
		Intrinsic::aarch64_neon_bfmlalb)
		.Case("aarch64.neon.bfmlalt.v4f32.v16i8",
		Intrinsic::aarch64_neon_bfmlalt)
		.Default(Intrinsic::not_intrinsic);
		if (IID == Intrinsic::not_intrinsic)
		break;

		std::array<Type *, 0> Tys;
		NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys);
		return true;
		}
break;		break;
}		}

case 'c': {		case 'c': {
if (Name.startswith("ctlz.") && F->arg_size() == 1) {		if (Name.startswith("ctlz.") && F->arg_size() == 1) {
rename(F);		rename(F);
NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,		NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
F->arg_begin()->getType());		F->arg_begin()->getType());
▲ Show 20 Lines • Show All 2,970 Lines • ▼ Show 20 Lines
case Intrinsic::arm_neon_vst3lane:		case Intrinsic::arm_neon_vst3lane:
case Intrinsic::arm_neon_vst4lane: {		case Intrinsic::arm_neon_vst4lane: {
SmallVector<Value *, 4> Args(CI->arg_operands().begin(),		SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
CI->arg_operands().end());		CI->arg_operands().end());
NewCall = Builder.CreateCall(NewFn, Args);		NewCall = Builder.CreateCall(NewFn, Args);
break;		break;
}		}

		case Intrinsic::arm_neon_bfdot:
		case Intrinsic::arm_neon_bfmmla:
		case Intrinsic::arm_neon_bfmlalb:
		case Intrinsic::arm_neon_bfmlalt:
		case Intrinsic::aarch64_neon_bfdot:
		case Intrinsic::aarch64_neon_bfmmla:
		case Intrinsic::aarch64_neon_bfmlalb:
		case Intrinsic::aarch64_neon_bfmlalt: {
		SmallVector<Value *, 3> Args;
		assert(CI->getNumArgOperands() == 3 &&
		"Mismatch between function args and call args");
		size_t OperandWidth =
		CI->getArgOperand(1)->getType()->getPrimitiveSizeInBits();
		assert((OperandWidth == 64 || OperandWidth == 128) &&
		"Unexpected operand width");
		Type *NewTy = FixedVectorType::get(Type::getBFloatTy(C), OperandWidth / 16);
		auto Iter = CI->arg_operands().begin();
		Args.push_back(*Iter++);
		Args.push_back(Builder.CreateBitCast(*Iter++, NewTy));
		Args.push_back(Builder.CreateBitCast(*Iter++, NewTy));
		NewCall = Builder.CreateCall(NewFn, Args);
		break;
		}

case Intrinsic::bitreverse:		case Intrinsic::bitreverse:
NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});		NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});
break;		break;

case Intrinsic::ctlz:		case Intrinsic::ctlz:
case Intrinsic::cttz:		case Intrinsic::cttz:
assert(CI->getNumArgOperands() == 1 &&		assert(CI->getNumArgOperands() == 1 &&
"Mismatch between function args and call args");		"Mismatch between function args and call args");
▲ Show 20 Lines • Show All 726 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64InstrFormats.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 7,835 Lines • ▼ Show 20 Lines	: BaseSIMDThreeSameVectorTied<Q, U, 0b010, 0b11111, RegType, asm, kind1, [(set (AccumType RegType:$dst),
(InputType RegType:$Rm)))]> {		(InputType RegType:$Rm)))]> {
let AsmString = !strconcat(asm,		let AsmString = !strconcat(asm,
"{\t$Rd" # kind1 # ", $Rn" # kind2 #		"{\t$Rd" # kind1 # ", $Rn" # kind2 #
", $Rm" # kind2 # "}");		", $Rm" # kind2 # "}");
}		}

multiclass SIMDThreeSameVectorBFDot<bit U, string asm> {		multiclass SIMDThreeSameVectorBFDot<bit U, string asm> {
def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64,		def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64,
v2f32, v8i8>;		v2f32, v4bf16>;
def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128,		def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128,
v4f32, v16i8>;		v4f32, v8bf16>;
}		}

class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,		class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
string dst_kind, string lhs_kind,		string dst_kind, string lhs_kind,
string rhs_kind,		string rhs_kind,
RegisterOperand RegType,		RegisterOperand RegType,
ValueType AccumType,		ValueType AccumType,
ValueType InputType>		ValueType InputType>
: BaseSIMDIndexedTied<Q, U, 0b0, 0b01, 0b1111,		: BaseSIMDIndexedTied<Q, U, 0b0, 0b01, 0b1111,
RegType, RegType, V128, VectorIndexS,		RegType, RegType, V128, VectorIndexS,
asm, "", dst_kind, lhs_kind, rhs_kind,		asm, "", dst_kind, lhs_kind, rhs_kind,
[(set (AccumType RegType:$dst),		[(set (AccumType RegType:$dst),
(AccumType (int_aarch64_neon_bfdot		(AccumType (int_aarch64_neon_bfdot
(AccumType RegType:$Rd),		(AccumType RegType:$Rd),
(InputType RegType:$Rn),		(InputType RegType:$Rn),
(InputType (bitconvert (AccumType		(InputType (bitconvert (AccumType
(AArch64duplane32 (v4f32 V128:$Rm),		(AArch64duplane32 (v4f32 V128:$Rm),
VectorIndexH:$idx)))))))]> {		VectorIndexS:$idx)))))))]> {

bits<2> idx;		bits<2> idx;
let Inst{21} = idx{0}; // L		let Inst{21} = idx{0}; // L
let Inst{11} = idx{1}; // H		let Inst{11} = idx{1}; // H
}		}

multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> {		multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> {

def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h",		def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h",
".2h", V64, v2f32, v8i8>;		".2h", V64, v2f32, v4bf16>;
def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h",		def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h",
".2h", V128, v4f32, v16i8>;		".2h", V128, v4f32, v8bf16>;
}		}

class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode>		class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode>
: BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s",		: BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s",
[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),		[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
(v16i8 V128:$Rn),		(v8bf16 V128:$Rn),
(v16i8 V128:$Rm)))]> {		(v8bf16 V128:$Rm)))]> {
let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}");		let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}");
}		}

class SIMDBF16MLALIndex<bit Q, string asm, SDPatternOperator OpNode>		class SIMDBF16MLALIndex<bit Q, string asm, SDPatternOperator OpNode>
: I<(outs V128:$dst),		: I<(outs V128:$dst),
(ins V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx), asm,		(ins V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx), asm,
"{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst",		"{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst",
[(set (v4f32 V128:$dst),		[(set (v4f32 V128:$dst),
(v4f32 (OpNode (v4f32 V128:$Rd),		(v4f32 (OpNode (v4f32 V128:$Rd),
(v16i8 V128:$Rn),		(v8bf16 V128:$Rn),
(v16i8 (bitconvert (v8bf16		(v8bf16
(AArch64duplane16 (v8bf16 V128_lo:$Rm),		(AArch64duplane16 (v8bf16 V128_lo:$Rm),
VectorIndexH:$idx)))))))]>,		VectorIndexH:$idx)))))]>,
Sched<[WriteV]> {		Sched<[WriteV]> {
bits<5> Rd;		bits<5> Rd;
bits<5> Rn;		bits<5> Rn;
bits<4> Rm;		bits<4> Rm;
bits<3> idx;		bits<3> idx;

let Inst{31} = 0;		let Inst{31} = 0;
let Inst{30} = Q;		let Inst{30} = Q;
let Inst{29-22} = 0b00111111;		let Inst{29-22} = 0b00111111;
let Inst{21-20} = idx{1-0};		let Inst{21-20} = idx{1-0};
let Inst{19-16} = Rm;		let Inst{19-16} = Rm;
let Inst{15-12} = 0b1111;		let Inst{15-12} = 0b1111;
let Inst{11} = idx{2}; // H		let Inst{11} = idx{2}; // H
let Inst{10} = 0;		let Inst{10} = 0;
let Inst{9-5} = Rn;		let Inst{9-5} = Rn;
let Inst{4-0} = Rd;		let Inst{4-0} = Rd;
}		}

class SIMDThreeSameVectorBF16MatrixMul<string asm>		class SIMDThreeSameVectorBF16MatrixMul<string asm>
: BaseSIMDThreeSameVectorTied<1, 1, 0b010, 0b11101,		: BaseSIMDThreeSameVectorTied<1, 1, 0b010, 0b11101,
V128, asm, ".4s",		V128, asm, ".4s",
[(set (v4f32 V128:$dst),		[(set (v4f32 V128:$dst),
(int_aarch64_neon_bfmmla (v4f32 V128:$Rd),		(int_aarch64_neon_bfmmla (v4f32 V128:$Rd),
(v16i8 V128:$Rn),		(v8bf16 V128:$Rn),
(v16i8 V128:$Rm)))]> {		(v8bf16 V128:$Rm)))]> {
let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h",		let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h",
", $Rm", ".8h", "}");		", $Rm", ".8h", "}");
}		}

class SIMD_BFCVTN		class SIMD_BFCVTN
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,		: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
"bfcvtn", ".4h", ".4s",		"bfcvtn", ".4h", ".4s",
[(set (v8bf16 V128:$Rd),		[(set (v8bf16 V128:$Rd),
▲ Show 20 Lines • Show All 3,317 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64InstrInfo.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 792 Lines • ▼ Show 20 Lines
	def BFMMLA : SIMDThreeSameVectorBF16MatrixMul<"bfmmla">;			def BFMMLA : SIMDThreeSameVectorBF16MatrixMul<"bfmmla">;
	def BFMLALB : SIMDBF16MLAL<0, "bfmlalb", int_aarch64_neon_bfmlalb>;			def BFMLALB : SIMDBF16MLAL<0, "bfmlalb", int_aarch64_neon_bfmlalb>;
	def BFMLALT : SIMDBF16MLAL<1, "bfmlalt", int_aarch64_neon_bfmlalt>;			def BFMLALT : SIMDBF16MLAL<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
	def BFMLALBIdx : SIMDBF16MLALIndex<0, "bfmlalb", int_aarch64_neon_bfmlalb>;			def BFMLALBIdx : SIMDBF16MLALIndex<0, "bfmlalb", int_aarch64_neon_bfmlalb>;
	def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;			def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
	def BFCVTN : SIMD_BFCVTN;			def BFCVTN : SIMD_BFCVTN;
	def BFCVTN2 : SIMD_BFCVTN2;			def BFCVTN2 : SIMD_BFCVTN2;
	def BFCVT : BF16ToSinglePrecision<"bfcvt">;			def BFCVT : BF16ToSinglePrecision<"bfcvt">;

				// Vector-scalar BFDOT:
				// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit
				// register (the instruction uses a single 32-bit lane from it), so the pattern
				// is a bit tricky.
				def : Pat<(v2f32 (int_aarch64_neon_bfdot
				(v2f32 V64:$Rd), (v4bf16 V64:$Rn),
				(v4bf16 (bitconvert
				(v2i32 (AArch64duplane32
				(v4i32 (bitconvert
				(v8bf16 (insert_subvector undef,
				(v4bf16 V64:$Rm),
				(i64 0))))),
				VectorIndexS:$idx)))))),
				(BF16DOTlanev4bf16 (v2f32 V64:$Rd), (v4bf16 V64:$Rn),
				(SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
				VectorIndexS:$idx)>;
	}			}

	// ARMv8.6A AArch64 matrix multiplication			// ARMv8.6A AArch64 matrix multiplication
	let Predicates = [HasMatMulInt8] in {			let Predicates = [HasMatMulInt8] in {
	def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>;			def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>;
	def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>;			def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>;
	def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>;			def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>;
	defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>;			defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>;
	▲ Show 20 Lines • Show All 6,865 Lines • Show Last 20 Lines

llvm/lib/Target/ARM/ARMInstrNEON.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 9,073 Lines • ▼ Show 20 Lines	def : Pat<
(AccumTy (int_arm_neon_bfdot (AccumTy RegTy:$Vd),		(AccumTy (int_arm_neon_bfdot (AccumTy RegTy:$Vd),
(InputTy RegTy:$Vn),		(InputTy RegTy:$Vn),
(InputTy (bitconvert (AccumTy		(InputTy (bitconvert (AccumTy
(ARMvduplane (AccumTy RegTy:$Vm),		(ARMvduplane (AccumTy RegTy:$Vm),
VectorIndex32:$lane)))))),		VectorIndex32:$lane)))))),
(!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>;		(!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>;
}		}

def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v8i8>;		def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v4bf16>;
def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v16i8>;		def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v8bf16>;

defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v8i8, (v2f32 DPR_VFP2:$Vm)>;		defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v4bf16, (v2f32 DPR_VFP2:$Vm)>;
defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v16i8, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;		defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v8bf16, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;

class BF16MM<bit Q, RegisterClass RegTy,		class BF16MM<bit Q, RegisterClass RegTy,
string opc>		string opc>
: N3Vnp<0b11000, 0b00, 0b1100, Q, 0,		: N3Vnp<0b11000, 0b00, 0b1100, Q, 0,
(outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm),		(outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm),
N3RegFrm, IIC_VDOTPROD, "", "",		N3RegFrm, IIC_VDOTPROD, "", "",
[(set (v4f32 QPR:$dst), (int_arm_neon_bfmmla (v4f32 QPR:$Vd),		[(set (v4f32 QPR:$dst), (int_arm_neon_bfmmla (v4f32 QPR:$Vd),
(v16i8 QPR:$Vn),		(v8bf16 QPR:$Vn),
(v16i8 QPR:$Vm)))]> {		(v8bf16 QPR:$Vm)))]> {
let Constraints = "$dst = $Vd";		let Constraints = "$dst = $Vd";
let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");		let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");
let DecoderNamespace = "VFPV8";		let DecoderNamespace = "VFPV8";
}		}

def VMMLA : BF16MM<1, QPR, "vmmla">;		def VMMLA : BF16MM<1, QPR, "vmmla">;

class VBF16MALQ<bit T, string suffix, SDPatternOperator OpNode>		class VBF16MALQ<bit T, string suffix, SDPatternOperator OpNode>
: N3VCP8<0b00, 0b11, T, 1,		: N3VCP8<0b00, 0b11, T, 1,
(outs QPR:$dst), (ins QPR:$Vd, QPR:$Vn, QPR:$Vm),		(outs QPR:$dst), (ins QPR:$Vd, QPR:$Vn, QPR:$Vm),
NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "",		NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "",
[(set (v4f32 QPR:$dst),		[(set (v4f32 QPR:$dst),
(OpNode (v4f32 QPR:$Vd),		(OpNode (v4f32 QPR:$Vd),
(v16i8 QPR:$Vn),		(v8bf16 QPR:$Vn),
(v16i8 QPR:$Vm)))]> {		(v8bf16 QPR:$Vm)))]> {
let Constraints = "$dst = $Vd";		let Constraints = "$dst = $Vd";
let DecoderNamespace = "VFPV8";		let DecoderNamespace = "VFPV8";
}		}

def VBF16MALTQ: VBF16MALQ<1, "t", int_arm_neon_bfmlalt>;		def VBF16MALTQ: VBF16MALQ<1, "t", int_arm_neon_bfmlalt>;
def VBF16MALBQ: VBF16MALQ<0, "b", int_arm_neon_bfmlalb>;		def VBF16MALBQ: VBF16MALQ<0, "b", int_arm_neon_bfmlalb>;

multiclass VBF16MALQI<bit T, string suffix, SDPatternOperator OpNode> {		multiclass VBF16MALQI<bit T, string suffix, SDPatternOperator OpNode> {
def "" : N3VLaneCP8<0, 0b11, T, 1, (outs QPR:$dst),		def "" : N3VLaneCP8<0, 0b11, T, 1, (outs QPR:$dst),
(ins QPR:$Vd, QPR:$Vn, DPR_8:$Vm, VectorIndex16:$idx),		(ins QPR:$Vd, QPR:$Vn, DPR_8:$Vm, VectorIndex16:$idx),
IIC_VMACD, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm$idx", "", []> {		IIC_VMACD, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm$idx", "", []> {
bits<2> idx;		bits<2> idx;
let Inst{5} = idx{1};		let Inst{5} = idx{1};
let Inst{3} = idx{0};		let Inst{3} = idx{0};
let Constraints = "$dst = $Vd";		let Constraints = "$dst = $Vd";
let DecoderNamespace = "VFPV8";		let DecoderNamespace = "VFPV8";
}		}

def : Pat<		def : Pat<
(v4f32 (OpNode (v4f32 QPR:$Vd),		(v4f32 (OpNode (v4f32 QPR:$Vd),
(v16i8 QPR:$Vn),		(v8bf16 QPR:$Vn),
(v16i8 (bitconvert (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm),		(v8bf16 (ARMvduplane (v8bf16 QPR:$Vm),
VectorIndex16:$lane)))))),		VectorIndex16:$lane)))),
(!cast<Instruction>(NAME) QPR:$Vd,		(!cast<Instruction>(NAME) QPR:$Vd,
QPR:$Vn,		QPR:$Vn,
(EXTRACT_SUBREG QPR:$Vm,		(EXTRACT_SUBREG QPR:$Vm,
(DSubReg_i16_reg VectorIndex16:$lane)),		(DSubReg_i16_reg VectorIndex16:$lane)),
(SubReg_i16_lane VectorIndex16:$lane))>;		(SubReg_i16_lane VectorIndex16:$lane))>;
}		}

defm VBF16MALTQI: VBF16MALQI<1, "t", int_arm_neon_bfmlalt>;		defm VBF16MALTQI: VBF16MALQI<1, "t", int_arm_neon_bfmlalt>;
defm VBF16MALBQI: VBF16MALQI<0, "b", int_arm_neon_bfmlalb>;		defm VBF16MALBQI: VBF16MALQI<0, "b", int_arm_neon_bfmlalb>;

def BF16_VCVT : N2V<0b11, 0b11, 0b01, 0b10, 0b01100, 1, 0,		def BF16_VCVT : N2V<0b11, 0b11, 0b01, 0b10, 0b01100, 1, 0,
(outs DPR:$Vd), (ins QPR:$Vm),		(outs DPR:$Vd), (ins QPR:$Vm),
NoItinerary, "vcvt", "bf16.f32", "$Vd, $Vm", "", []>;		NoItinerary, "vcvt", "bf16.f32", "$Vd, $Vm", "", []>;
}		}
// End of BFloat16 instructions		// End of BFloat16 instructions

llvm/test/Bitcode/aarch64-bf16-upgrade.ll

This file was added.

				; RUN: llvm-dis < %s.bc | FileCheck %s

				; Bitcode was generated from file below

				define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
				; CHECK-LABEL: @test_vbfdot_f32
				entry:
				%0 = bitcast <4 x bfloat> %a to <8 x i8>
				%1 = bitcast <4 x bfloat> %b to <8 x i8>
				; CHECK: %2 = bitcast <8 x i8> %0 to <4 x bfloat>
				; CHECK-NEXT: %3 = bitcast <8 x i8> %1 to <4 x bfloat>
				; CHECK-NEXT: %vbfdot1.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %2, <4 x bfloat> %3)
				%vbfdot1.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
				ret <2 x float> %vbfdot1.i
				}

				define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
				; CHECK-LABEL: @test_vbfdotq_f32
				entry:
				%0 = bitcast <8 x bfloat> %a to <16 x i8>
				%1 = bitcast <8 x bfloat> %b to <16 x i8>
				; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
				; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
				; CHECK-NEXT: %vbfdot1.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
				%vbfdot1.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				ret <4 x float> %vbfdot1.i
				}

				define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
				; CHECK-LABEL: @test_vbfmmlaq_f32
				entry:
				%0 = bitcast <8 x bfloat> %a to <16 x i8>
				%1 = bitcast <8 x bfloat> %b to <16 x i8>
				%vbfmmla1.i = call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
				; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
				; CHECK-NEXT: %vbfmmla1.i = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
				ret <4 x float> %vbfmmla1.i
				}

				define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
				; CHECK-LABEL: @test_vbfmlalbq_laneq_f32
				entry:
				%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
				%0 = bitcast <8 x bfloat> %a to <16 x i8>
				%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
				%vbfmlalb1.i = call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
				; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
				; CHECK-NEXT: %vbfmlalb1.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
				ret <4 x float> %vbfmlalb1.i
				}

				define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
				; CHECK-LABEL: @test_vbfmlaltq_laneq_f32
				entry:
				%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
				%0 = bitcast <8 x bfloat> %a to <16 x i8>
				%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
				%vbfmlalt1.i = call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
				; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
				; CHECK-NEXT: %vbfmlalt1.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
				ret <4 x float> %vbfmlalt1.i
				}

				declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
				; CHECK: declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
				declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
				; CHECK: declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
				declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
				; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
				declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
				; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
				declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
				; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)
				No newline at end of file

llvm/test/Bitcode/aarch64-bf16-upgrade.ll.bc

This file was added.

This is an empty file.

llvm/test/Bitcode/arm-bf16-upgrade.ll

This file was added.

				; RUN: llvm-dis < %s.bc | FileCheck %s

				; Bitcode was generated from file below

				define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
				; CHECK-LABEL: @test_vbfdot_f32
				entry:
				%0 = bitcast <4 x bfloat> %a to <8 x i8>
				%1 = bitcast <4 x bfloat> %b to <8 x i8>
				%vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
				; CHECK: %2 = bitcast <8 x i8> %0 to <4 x bfloat>
				; CHECK-NEXT: %3 = bitcast <8 x i8> %1 to <4 x bfloat>
				; CHECK-NEXT: %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %2, <4 x bfloat> %3)
				ret <2 x float> %vbfdot1.i
				}

				define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
				; CHECK-LABEL: @test_vbfdotq_f32
				entry:
				%0 = bitcast <8 x bfloat> %a to <16 x i8>
				%1 = bitcast <8 x bfloat> %b to <16 x i8>
				%vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
				; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
				; CHECK-NEXT: %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
				ret <4 x float> %vbfdot1.i
				}

				define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
				; CHECK-LABEL: @test_vbfmmlaq_f32
				entry:
				%0 = bitcast <8 x bfloat> %a to <16 x i8>
				%1 = bitcast <8 x bfloat> %b to <16 x i8>
				%vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
				; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
				; CHECK-NEXT: %vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
				ret <4 x float> %vbfmmla1.i
				}

				define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
				; CHECK-LABEL: @test_vbfmlalbq_laneq_f32
				entry:
				%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
				%0 = bitcast <8 x bfloat> %a to <16 x i8>
				%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
				%vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
				; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
				; CHECK-NEXT: %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
				ret <4 x float> %vbfmlalb1.i
				}

				define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
				; CHECK-LABEL: @test_vbfmlaltq_laneq_f32
				entry:
				%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
				%0 = bitcast <8 x bfloat> %a to <16 x i8>
				%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
				%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
				; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
				; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
				; CHECK-NEXT: %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
				ret <4 x float> %vbfmlalt1.i
				}

				; Declarations of the legacy i8-vector-typed intrinsics called above. Each one
				; is followed by a FileCheck directive giving the bfloat-typed declaration
				; expected in the output: bfdot keeps a type suffix (v4bf16 / v8bf16), while
				; bfmmla, bfmlalb and bfmlalt are expected without any type suffix.
				declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
				; CHECK: declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
				declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
				; CHECK: declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
				declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
				; CHECK: declare <4 x float> @llvm.arm.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
				declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
				; CHECK: declare <4 x float> @llvm.arm.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
				declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
				; CHECK: declare <4 x float> @llvm.arm.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)
				No newline at end of file

llvm/test/Bitcode/arm-bf16-upgrade.ll.bc

This file was added.

This is an empty file.

llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -mtriple aarch64-arm-none-eabi -mattr=+bf16 %s -o - \| FileCheck %s			; RUN: llc -mtriple aarch64-arm-none-eabi -mattr=+bf16 %s -o - \| FileCheck %s

	define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {			define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
	; CHECK-LABEL: test_vbfdot_f32:			; CHECK-LABEL: test_vbfdot_f32:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.4h			; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.4h
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%0 = bitcast <4 x bfloat> %a to <8 x i8>			%vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b)
	%1 = bitcast <4 x bfloat> %b to <8 x i8>			ret <2 x float> %vbfdot3.i
	%vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
	ret <2 x float> %vbfdot1.i
	}			}

	define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {			define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
	; CHECK-LABEL: test_vbfdotq_f32:			; CHECK-LABEL: test_vbfdotq_f32:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.8h			; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.8h
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%0 = bitcast <8 x bfloat> %a to <16 x i8>			%vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
	%1 = bitcast <8 x bfloat> %b to <16 x i8>			ret <4 x float> %vbfdot3.i
	%vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
	ret <4 x float> %vbfdot1.i
	}			}

	define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {			define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
	; CHECK-LABEL: test_vbfdot_lane_f32:			; CHECK-LABEL: test_vbfdot_lane_f32:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK: bfdot v0.2s, v1.4h, v2.2h[0]			; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
				; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[0]
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%0 = bitcast <4 x bfloat> %b to <2 x float>			%.cast = bitcast <4 x bfloat> %b to <2 x float>
	%shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer			%lane = shufflevector <2 x float> %.cast, <2 x float> undef, <2 x i32> zeroinitializer
	%1 = bitcast <4 x bfloat> %a to <8 x i8>			%.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
	%2 = bitcast <2 x float> %shuffle to <8 x i8>			%vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1)
	%vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)			ret <2 x float> %vbfdot3.i
	ret <2 x float> %vbfdot1.i
	}			}

	define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {			define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
	; CHECK-LABEL: test_vbfdotq_laneq_f32:			; CHECK-LABEL: test_vbfdotq_laneq_f32:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.2h[3]			; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.2h[3]
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%0 = bitcast <8 x bfloat> %b to <4 x float>			%.cast = bitcast <8 x bfloat> %b to <4 x float>
	%shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>			%lane = shufflevector <4 x float> %.cast, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
	%1 = bitcast <8 x bfloat> %a to <16 x i8>			%.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
	%2 = bitcast <4 x float> %shuffle to <16 x i8>			%vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1)
	%vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)			ret <4 x float> %vbfdot3.i
	ret <4 x float> %vbfdot1.i
	}			}

	define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {			define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {
	; CHECK-LABEL: test_vbfdot_laneq_f32:			; CHECK-LABEL: test_vbfdot_laneq_f32:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[3]			; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[3]
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%0 = bitcast <8 x bfloat> %b to <4 x float>			%.cast = bitcast <8 x bfloat> %b to <4 x float>
	%shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>			%lane = shufflevector <4 x float> %.cast, <4 x float> undef, <2 x i32> <i32 3, i32 3>
	%1 = bitcast <4 x bfloat> %a to <8 x i8>			%.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
	%2 = bitcast <2 x float> %shuffle to <8 x i8>			%vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1)
	%vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)			ret <2 x float> %vbfdot3.i
	ret <2 x float> %vbfdot1.i
	}			}

	define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {			define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
	; CHECK-LABEL: test_vbfdotq_lane_f32:			; CHECK-LABEL: test_vbfdotq_lane_f32:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK: bfdot v0.4s, v1.8h, v2.2h[0]			; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
				; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.2h[0]
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%0 = bitcast <4 x bfloat> %b to <2 x float>			%.cast = bitcast <4 x bfloat> %b to <2 x float>
	%shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer			%lane = shufflevector <2 x float> %.cast, <2 x float> undef, <4 x i32> zeroinitializer
	%1 = bitcast <8 x bfloat> %a to <16 x i8>			%.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
	%2 = bitcast <4 x float> %shuffle to <16 x i8>			%vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1)
	%vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)			ret <4 x float> %vbfdot3.i
	ret <4 x float> %vbfdot1.i
	}			}

	define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {			define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
	; CHECK-LABEL: test_vbfmmlaq_f32:			; CHECK-LABEL: test_vbfmmlaq_f32:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: bfmmla v0.4s, v1.8h, v2.8h			; CHECK-NEXT: bfmmla v0.4s, v1.8h, v2.8h
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%0 = bitcast <8 x bfloat> %a to <16 x i8>			%vbfmmlaq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
	%1 = bitcast <8 x bfloat> %b to <16 x i8>			ret <4 x float> %vbfmmlaq_v3.i
	%vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
	ret <4 x float> %vbfmmla1.i
	}			}

	define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {			define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
	; CHECK-LABEL: test_vbfmlalbq_f32:			; CHECK-LABEL: test_vbfmlalbq_f32:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.8h			; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.8h
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%0 = bitcast <8 x bfloat> %a to <16 x i8>			%vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
	%1 = bitcast <8 x bfloat> %b to <16 x i8>			ret <4 x float> %vbfmlalbq_v3.i
	%vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
	ret <4 x float> %vbfmlalb1.i
	}			}

	define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {			define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
	; CHECK-LABEL: test_vbfmlaltq_f32:			; CHECK-LABEL: test_vbfmlaltq_f32:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.8h			; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.8h
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%0 = bitcast <8 x bfloat> %a to <16 x i8>			%vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
	%1 = bitcast <8 x bfloat> %b to <16 x i8>			ret <4 x float> %vbfmlaltq_v3.i
	%vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
	ret <4 x float> %vbfmlalt1.i
	}			}

	define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {			define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
	; CHECK-LABEL: test_vbfmlalbq_lane_f32:			; CHECK-LABEL: test_vbfmlalbq_lane_f32:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK: bfmlalb v0.4s, v1.8h, v2.h[0]			; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
				; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.h[0]
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer			%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
	%0 = bitcast <8 x bfloat> %a to <16 x i8>			%vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
	%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>			ret <4 x float> %vbfmlalbq_v3.i
	%vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
	ret <4 x float> %vbfmlalb1.i
	}			}

	define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {			define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
	; CHECK-LABEL: test_vbfmlalbq_laneq_f32:			; CHECK-LABEL: test_vbfmlalbq_laneq_f32:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.h[3]			; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.h[3]
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>			%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
	%0 = bitcast <8 x bfloat> %a to <16 x i8>			%vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
	%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>			ret <4 x float> %vbfmlalbq_v3.i
	%vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
	ret <4 x float> %vbfmlalb1.i
	}			}

	define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {			define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
	; CHECK-LABEL: test_vbfmlaltq_lane_f32:			; CHECK-LABEL: test_vbfmlaltq_lane_f32:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK: bfmlalt v0.4s, v1.8h, v2.h[0]			; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
				; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.h[0]
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer			%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
	%0 = bitcast <8 x bfloat> %a to <16 x i8>			%vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
	%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>			ret <4 x float> %vbfmlaltq_v3.i
	%vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
	ret <4 x float> %vbfmlalt1.i
	}			}

	define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {			define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
	; CHECK-LABEL: test_vbfmlaltq_laneq_f32:			; CHECK-LABEL: test_vbfmlaltq_laneq_f32:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.h[3]			; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.h[3]
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>			%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
	%0 = bitcast <8 x bfloat> %a to <16 x i8>			%vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
	%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>			ret <4 x float> %vbfmlaltq_v3.i
	%vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
	ret <4 x float> %vbfmlalt1.i
	}			}

	declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) #2			declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
	declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2			declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
	declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2			declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
	declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2			declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
	declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2			declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)

llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -mtriple armv8.6a-arm-none-eabi -mattr=+neon,+bf16 -float-abi=hard -verify-machineinstrs < %s -o - \| FileCheck %s			; RUN: llc -mtriple armv8.6a-arm-none-eabi -mattr=+neon,+bf16 -float-abi=hard -verify-machineinstrs < %s -o - \| FileCheck %s

	define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {			define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
	; CHECK-LABEL: test_vbfdot_f32:			; CHECK-LABEL: test_vbfdot_f32:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vdot.bf16 d0, d1, d2			; CHECK-NEXT: vdot.bf16 d0, d1, d2
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%0 = bitcast <4 x bfloat> %a to <8 x i8>			%vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) #3
	%1 = bitcast <4 x bfloat> %b to <8 x i8>			ret <2 x float> %vbfdot3.i
	%vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
	ret <2 x float> %vbfdot1.i
	}			}

	define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {			define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
	; CHECK-LABEL: test_vbfdotq_f32:			; CHECK-LABEL: test_vbfdotq_f32:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vdot.bf16 q0, q1, q2			; CHECK-NEXT: vdot.bf16 q0, q1, q2
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%0 = bitcast <8 x bfloat> %a to <16 x i8>			%vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) #3
	%1 = bitcast <8 x bfloat> %b to <16 x i8>			ret <4 x float> %vbfdot3.i
	%vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
	ret <4 x float> %vbfdot1.i
	}			}

	define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {			define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
	; CHECK-LABEL: test_vbfdot_lane_f32:			; CHECK-LABEL: test_vbfdot_lane_f32:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vdot.bf16 d0, d1, d2[0]			; CHECK-NEXT: vdot.bf16 d0, d1, d2[0]
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%0 = bitcast <4 x bfloat> %b to <2 x float>			%.cast = bitcast <4 x bfloat> %b to <2 x float>
	%shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer			%lane = shufflevector <2 x float> %.cast, <2 x float> undef, <2 x i32> zeroinitializer
	%1 = bitcast <4 x bfloat> %a to <8 x i8>			%.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
	%2 = bitcast <2 x float> %shuffle to <8 x i8>			%vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) #3
	%vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)			ret <2 x float> %vbfdot3.i
	ret <2 x float> %vbfdot1.i
	}			}

	define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {			define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
	; CHECK-LABEL: test_vbfdotq_laneq_f32:			; CHECK-LABEL: test_vbfdotq_laneq_f32:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vdup.32 q8, d5[1]			; CHECK-NEXT: vdup.32 q8, d5[1]
	; CHECK-NEXT: vdot.bf16 q0, q1, q8			; CHECK-NEXT: vdot.bf16 q0, q1, q8
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%0 = bitcast <8 x bfloat> %b to <4 x float>			%.cast = bitcast <8 x bfloat> %b to <4 x float>
	%shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>			%lane = shufflevector <4 x float> %.cast, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
	%1 = bitcast <8 x bfloat> %a to <16 x i8>			%.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
	%2 = bitcast <4 x float> %shuffle to <16 x i8>			%vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) #3
	%vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)			ret <4 x float> %vbfdot3.i
	ret <4 x float> %vbfdot1.i
	}			}

	define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {			define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {
	; CHECK-LABEL: test_vbfdot_laneq_f32:			; CHECK-LABEL: test_vbfdot_laneq_f32:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vdot.bf16 d0, d1, d3[1]			; CHECK-NEXT: vdot.bf16 d0, d1, d3[1]
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%0 = bitcast <8 x bfloat> %b to <4 x float>			%.cast = bitcast <8 x bfloat> %b to <4 x float>
	%shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>			%lane = shufflevector <4 x float> %.cast, <4 x float> undef, <2 x i32> <i32 3, i32 3>
	%1 = bitcast <4 x bfloat> %a to <8 x i8>			%.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
	%2 = bitcast <2 x float> %shuffle to <8 x i8>			%vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) #3
	%vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)			ret <2 x float> %vbfdot3.i
	ret <2 x float> %vbfdot1.i
	}			}

	define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {			define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
	; CHECK-LABEL: test_vbfdotq_lane_f32:			; CHECK-LABEL: test_vbfdotq_lane_f32:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2			; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2
	; CHECK-NEXT: vdot.bf16 q0, q1, d4[0]			; CHECK-NEXT: vdot.bf16 q0, q1, d4[0]
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%0 = bitcast <4 x bfloat> %b to <2 x float>			%.cast = bitcast <4 x bfloat> %b to <2 x float>
	%shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer			%lane = shufflevector <2 x float> %.cast, <2 x float> undef, <4 x i32> zeroinitializer
	%1 = bitcast <8 x bfloat> %a to <16 x i8>			%.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
	%2 = bitcast <4 x float> %shuffle to <16 x i8>			%vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) #3
	%vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)			ret <4 x float> %vbfdot3.i
	ret <4 x float> %vbfdot1.i
	}			}

	define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {			define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
	; CHECK-LABEL: test_vbfmmlaq_f32:			; CHECK-LABEL: test_vbfmmlaq_f32:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vmmla.bf16 q0, q1, q2			; CHECK-NEXT: vmmla.bf16 q0, q1, q2
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%0 = bitcast <8 x bfloat> %a to <16 x i8>			%vbfmmlaq_v3.i = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
	%1 = bitcast <8 x bfloat> %b to <16 x i8>			ret <4 x float> %vbfmmlaq_v3.i
	%vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
	ret <4 x float> %vbfmmla1.i
	}			}

	define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {			define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
	; CHECK-LABEL: test_vbfmlalbq_f32:			; CHECK-LABEL: test_vbfmlalbq_f32:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vfmab.bf16 q0, q1, q2			; CHECK-NEXT: vfmab.bf16 q0, q1, q2
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%0 = bitcast <8 x bfloat> %a to <16 x i8>			%vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
	%1 = bitcast <8 x bfloat> %b to <16 x i8>			ret <4 x float> %vbfmlalbq_v3.i
	%vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
	ret <4 x float> %vbfmlalb1.i
	}			}

	define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {			define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
	; CHECK-LABEL: test_vbfmlaltq_f32:			; CHECK-LABEL: test_vbfmlaltq_f32:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vfmat.bf16 q0, q1, q2			; CHECK-NEXT: vfmat.bf16 q0, q1, q2
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%0 = bitcast <8 x bfloat> %a to <16 x i8>			%vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
	%1 = bitcast <8 x bfloat> %b to <16 x i8>			ret <4 x float> %vbfmlaltq_v3.i
	%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
	ret <4 x float> %vbfmlalt1.i
	}			}

	define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {			define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
	; CHECK-LABEL: test_vbfmlalbq_lane_f32:			; CHECK-LABEL: test_vbfmlalbq_lane_f32:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2			; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2
	; CHECK-NEXT: vfmab.bf16 q0, q1, d4[0]			; CHECK-NEXT: vfmab.bf16 q0, q1, d4[0]
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer			%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
	%0 = bitcast <8 x bfloat> %a to <16 x i8>			%vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
	%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>			ret <4 x float> %vbfmlalbq_v3.i
	%vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
	ret <4 x float> %vbfmlalb1.i
	}			}

	define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {			define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
	; CHECK-LABEL: test_vbfmlalbq_laneq_f32:			; CHECK-LABEL: test_vbfmlalbq_laneq_f32:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vfmab.bf16 q0, q1, d4[3]			; CHECK-NEXT: vfmab.bf16 q0, q1, d4[3]
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>			%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
	%0 = bitcast <8 x bfloat> %a to <16 x i8>			%vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
	%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>			ret <4 x float> %vbfmlalbq_v3.i
	%vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
	ret <4 x float> %vbfmlalb1.i
	}			}

	define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {			define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
	; CHECK-LABEL: test_vbfmlaltq_lane_f32:			; CHECK-LABEL: test_vbfmlaltq_lane_f32:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2			; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2
	; CHECK-NEXT: vfmat.bf16 q0, q1, d4[0]			; CHECK-NEXT: vfmat.bf16 q0, q1, d4[0]
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer			%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
	%0 = bitcast <8 x bfloat> %a to <16 x i8>			%vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
	%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>			ret <4 x float> %vbfmlaltq_v3.i
	%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
	ret <4 x float> %vbfmlalt1.i
	}			}

	define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {			define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
	; CHECK-LABEL: test_vbfmlaltq_laneq_f32:			; CHECK-LABEL: test_vbfmlaltq_laneq_f32:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vfmat.bf16 q0, q1, d4[3]			; CHECK-NEXT: vfmat.bf16 q0, q1, d4[3]
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>			%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
	%0 = bitcast <8 x bfloat> %a to <16 x i8>			%vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
	%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>			ret <4 x float> %vbfmlaltq_v3.i
	%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
	ret <4 x float> %vbfmlalt1.i
	}			}

	define <4 x float> @test_vbfmlaltq_laneq_f32_v2(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {			define <4 x float> @test_vbfmlaltq_laneq_f32_v2(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
	[Inline review comment, marked Done] dmgreen: It seems like it's probably worth keeping this test.
	; CHECK-LABEL: test_vbfmlaltq_laneq_f32_v2:			; CHECK-LABEL: test_vbfmlaltq_laneq_f32_v2:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vdup.16 q8, d5[2]			; CHECK-NEXT: vdup.16 q8, d5[2]
	; CHECK-NEXT: vfmat.bf16 q0, q1, q8			; CHECK-NEXT: vfmat.bf16 q0, q1, q8
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>			%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
	%0 = bitcast <8 x bfloat> %a to <16 x i8>			%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
	%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
	%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
	ret <4 x float> %vbfmlalt1.i			ret <4 x float> %vbfmlalt1.i
	}			}

	declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)			declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
	declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)			declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
	declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)			declare <4 x float> @llvm.arm.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
	declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)			declare <4 x float> @llvm.arm.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
	declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)			declare <4 x float> @llvm.arm.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)

This is an archive of the discontinued LLVM Phabricator instance.

[ARM][BFloat16] Change types of some Arm and AArch64 bf16 intrinsics
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 288390

clang/lib/CodeGen/CGBuiltin.cpp

clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c

clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c

llvm/include/llvm/IR/IntrinsicsAArch64.td

llvm/include/llvm/IR/IntrinsicsARM.td

llvm/lib/IR/AutoUpgrade.cpp

llvm/lib/Target/AArch64/AArch64InstrFormats.td

llvm/lib/Target/AArch64/AArch64InstrInfo.td

llvm/lib/Target/ARM/ARMInstrNEON.td

llvm/test/Bitcode/aarch64-bf16-upgrade.ll

llvm/test/Bitcode/aarch64-bf16-upgrade.ll.bc

llvm/test/Bitcode/arm-bf16-upgrade.ll

llvm/test/Bitcode/arm-bf16-upgrade.ll.bc

llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll

llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll

This is an archive of the discontinued LLVM Phabricator instance.

[ARM][BFloat16] Change types of some Arm and AArch64 bf16 intrinsics
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 288390

clang/lib/CodeGen/CGBuiltin.cpp

clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c

clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c

llvm/include/llvm/IR/IntrinsicsAArch64.td

llvm/include/llvm/IR/IntrinsicsARM.td

llvm/lib/IR/AutoUpgrade.cpp

llvm/lib/Target/AArch64/AArch64InstrFormats.td

llvm/lib/Target/AArch64/AArch64InstrInfo.td

llvm/lib/Target/ARM/ARMInstrNEON.td

llvm/test/Bitcode/aarch64-bf16-upgrade.ll

llvm/test/Bitcode/aarch64-bf16-upgrade.ll.bc

llvm/test/Bitcode/arm-bf16-upgrade.ll

llvm/test/Bitcode/arm-bf16-upgrade.ll.bc

llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll

llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll

[ARM][BFloat16] Change types of some Arm and AArch64 bf16 intrinsics
ClosedPublic