This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Change size suffix for FP16FML intrinsics.
ClosedPublic

Authored by ab on Feb 15 2019, 2:04 PM.

Download Raw Diff

Details

Reviewers

SjoerdMeijer
bryanpkc

Commits

rGb26fa705df57: [AArch64] Change size suffix for FP16FML intrinsics.
rC354538: [AArch64] Change size suffix for FP16FML intrinsics.
rL354538: [AArch64] Change size suffix for FP16FML intrinsics.

Summary

These currently use _u32, but they should instead use _f32 or _f16, the types of the accumulator and of the multiplication operands, respectively.

I'm starting with _f16 (because that seems to match the various integer vmlal variants), but either seems fine.

Diff Detail

Repository: rL LLVM

Event Timeline

ab created this revision. · Feb 15 2019, 2:04 PM

Herald added a subscriber: javed.absar. · View Herald Transcript · Feb 15 2019, 2:04 PM

ab mentioned this in D53633: [AArch64] Implement FP16FML intrinsics. · Feb 15 2019, 2:09 PM

I am discussing this with our GCC team, as we would like the Clang and GCC implementations to be the same. But you're right that _f16 looks to be the more consistent choice. I will let you know as soon as I know more.

LGTM

The ACLE has been updated, and a new version with the change included will be released soon.

This revision is now accepted and ready to land. · Feb 19 2019, 5:08 AM

Closed by commit rL354538: [AArch64] Change size suffix for FP16FML intrinsics. (authored by ab). · Explain Why · Feb 20 2019, 5:13 PM

This revision was automatically updated to reflect the committed changes.

Herald added a project: Restricted Project. · View Herald Transcript · Feb 20 2019, 5:13 PM

Herald added a subscriber: llvm-commits. · View Herald Transcript

Thanks for checking, much appreciated!

Revision Contents

Path

Size

cfe/

trunk/

include/

clang/

Basic/

arm_neon.td

28 lines

test/

CodeGen/

aarch64-neon-fp16fml.c

144 lines

Diff 187696

cfe/trunk/include/clang/Basic/arm_neon.td

	Show First 20 Lines • Show All 1,645 Lines • ▼ Show 20 Lines
	}			}
	let ArchGuard = "defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)" in {			let ArchGuard = "defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)" in {
	// Variants indexing into a 128-bit vector are A64 only.			// Variants indexing into a 128-bit vector are A64 only.
	def UDOT_LANEQ : SOpInst<"vdot_laneq", "dd89i", "iUiQiQUi", OP_DOT_LNQ>;			def UDOT_LANEQ : SOpInst<"vdot_laneq", "dd89i", "iUiQiQUi", OP_DOT_LNQ>;
	}			}

	// v8.2-A FP16 fused multiply-add long instructions.			// v8.2-A FP16 fused multiply-add long instructions.
	let ArchGuard = "defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__)" in {			let ArchGuard = "defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__)" in {
	def VFMLAL_LOW : SInst<"vfmlal_low", "ffHH", "UiQUi">;			def VFMLAL_LOW : SInst<"vfmlal_low", "ffHH", "hQh">;
	def VFMLSL_LOW : SInst<"vfmlsl_low", "ffHH", "UiQUi">;			def VFMLSL_LOW : SInst<"vfmlsl_low", "ffHH", "hQh">;
	def VFMLAL_HIGH : SInst<"vfmlal_high", "ffHH", "UiQUi">;			def VFMLAL_HIGH : SInst<"vfmlal_high", "ffHH", "hQh">;
	def VFMLSL_HIGH : SInst<"vfmlsl_high", "ffHH", "UiQUi">;			def VFMLSL_HIGH : SInst<"vfmlsl_high", "ffHH", "hQh">;

	def VFMLAL_LANE_LOW : SOpInst<"vfmlal_lane_low", "ffH0i", "UiQUi", OP_FMLAL_LN>;			def VFMLAL_LANE_LOW : SOpInst<"vfmlal_lane_low", "ffH0i", "hQh", OP_FMLAL_LN>;
	def VFMLSL_LANE_LOW : SOpInst<"vfmlsl_lane_low", "ffH0i", "UiQUi", OP_FMLSL_LN>;			def VFMLSL_LANE_LOW : SOpInst<"vfmlsl_lane_low", "ffH0i", "hQh", OP_FMLSL_LN>;
	def VFMLAL_LANE_HIGH : SOpInst<"vfmlal_lane_high", "ffH0i", "UiQUi", OP_FMLAL_LN_Hi>;			def VFMLAL_LANE_HIGH : SOpInst<"vfmlal_lane_high", "ffH0i", "hQh", OP_FMLAL_LN_Hi>;
	def VFMLSL_LANE_HIGH : SOpInst<"vfmlsl_lane_high", "ffH0i", "UiQUi", OP_FMLSL_LN_Hi>;			def VFMLSL_LANE_HIGH : SOpInst<"vfmlsl_lane_high", "ffH0i", "hQh", OP_FMLSL_LN_Hi>;

	def VFMLAL_LANEQ_LOW : SOpInst<"vfmlal_laneq_low", "ffH1i", "UiQUi", OP_FMLAL_LN>;			def VFMLAL_LANEQ_LOW : SOpInst<"vfmlal_laneq_low", "ffH1i", "hQh", OP_FMLAL_LN>;
	def VFMLSL_LANEQ_LOW : SOpInst<"vfmlsl_laneq_low", "ffH1i", "UiQUi", OP_FMLSL_LN>;			def VFMLSL_LANEQ_LOW : SOpInst<"vfmlsl_laneq_low", "ffH1i", "hQh", OP_FMLSL_LN>;
	def VFMLAL_LANEQ_HIGH : SOpInst<"vfmlal_laneq_high", "ffH1i", "UiQUi", OP_FMLAL_LN_Hi>;			def VFMLAL_LANEQ_HIGH : SOpInst<"vfmlal_laneq_high", "ffH1i", "hQh", OP_FMLAL_LN_Hi>;
	def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "ffH1i", "UiQUi", OP_FMLSL_LN_Hi>;			def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "ffH1i", "hQh", OP_FMLSL_LN_Hi>;
	}			}

cfe/trunk/test/CodeGen/aarch64-neon-fp16fml.c

	// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +v8.2a -target-feature +neon -target-feature +fp16fml \			// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +v8.2a -target-feature +neon -target-feature +fp16fml \
	// RUN: -fallow-half-arguments-and-returns -disable-O0-optnone -emit-llvm -o - %s \| opt -S -instcombine \| FileCheck %s			// RUN: -fallow-half-arguments-and-returns -disable-O0-optnone -emit-llvm -o - %s \| opt -S -instcombine \| FileCheck %s

	// REQUIRES: aarch64-registered-target			// REQUIRES: aarch64-registered-target

	// Test AArch64 Armv8.2-A FP16 Fused Multiply-Add Long intrinsics			// Test AArch64 Armv8.2-A FP16 Fused Multiply-Add Long intrinsics

	#include <arm_neon.h>			#include <arm_neon.h>

	// Vector form			// Vector form

	float32x2_t test_vfmlal_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) {			float32x2_t test_vfmlal_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
	// CHECK-LABEL: define <2 x float> @test_vfmlal_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)			// CHECK-LABEL: define <2 x float> @test_vfmlal_low_f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
	// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)			// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
	// CHECK: ret <2 x float> [[RESULT]]			// CHECK: ret <2 x float> [[RESULT]]
	return vfmlal_low_u32(a, b, c);			return vfmlal_low_f16(a, b, c);
	}			}

	float32x2_t test_vfmlsl_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) {			float32x2_t test_vfmlsl_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
	// CHECK-LABEL: define <2 x float> @test_vfmlsl_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)			// CHECK-LABEL: define <2 x float> @test_vfmlsl_low_f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
	// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)			// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
	// CHECK: ret <2 x float> [[RESULT]]			// CHECK: ret <2 x float> [[RESULT]]
	return vfmlsl_low_u32(a, b, c);			return vfmlsl_low_f16(a, b, c);
	}			}

	float32x2_t test_vfmlal_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) {			float32x2_t test_vfmlal_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
	// CHECK-LABEL: define <2 x float> @test_vfmlal_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)			// CHECK-LABEL: define <2 x float> @test_vfmlal_high_f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
	// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)			// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
	// CHECK: ret <2 x float> [[RESULT]]			// CHECK: ret <2 x float> [[RESULT]]
	return vfmlal_high_u32(a, b, c);			return vfmlal_high_f16(a, b, c);
	}			}

	float32x2_t test_vfmlsl_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) {			float32x2_t test_vfmlsl_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
	// CHECK-LABEL: define <2 x float> @test_vfmlsl_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)			// CHECK-LABEL: define <2 x float> @test_vfmlsl_high_f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
	// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)			// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
	// CHECK: ret <2 x float> [[RESULT]]			// CHECK: ret <2 x float> [[RESULT]]
	return vfmlsl_high_u32(a, b, c);			return vfmlsl_high_f16(a, b, c);
	}			}

	float32x4_t test_vfmlalq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) {			float32x4_t test_vfmlalq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
	// CHECK-LABEL: define <4 x float> @test_vfmlalq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)			// CHECK-LABEL: define <4 x float> @test_vfmlalq_low_f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
	// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)			// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
	// CHECK: ret <4 x float> [[RESULT]]			// CHECK: ret <4 x float> [[RESULT]]
	return vfmlalq_low_u32(a, b, c);			return vfmlalq_low_f16(a, b, c);
	}			}

	float32x4_t test_vfmlslq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) {			float32x4_t test_vfmlslq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
	// CHECK-LABEL: define <4 x float> @test_vfmlslq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)			// CHECK-LABEL: define <4 x float> @test_vfmlslq_low_f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
	// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)			// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
	// CHECK: ret <4 x float> [[RESULT]]			// CHECK: ret <4 x float> [[RESULT]]
	return vfmlslq_low_u32(a, b, c);			return vfmlslq_low_f16(a, b, c);
	}			}

	float32x4_t test_vfmlalq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) {			float32x4_t test_vfmlalq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
	// CHECK-LABEL: define <4 x float> @test_vfmlalq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)			// CHECK-LABEL: define <4 x float> @test_vfmlalq_high_f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
	// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)			// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
	// CHECK: ret <4 x float> [[RESULT]]			// CHECK: ret <4 x float> [[RESULT]]
	return vfmlalq_high_u32(a, b, c);			return vfmlalq_high_f16(a, b, c);
	}			}

	float32x4_t test_vfmlslq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) {			float32x4_t test_vfmlslq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
	// CHECK-LABEL: define <4 x float> @test_vfmlslq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)			// CHECK-LABEL: define <4 x float> @test_vfmlslq_high_f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
	// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)			// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
	// CHECK: ret <4 x float> [[RESULT]]			// CHECK: ret <4 x float> [[RESULT]]
	return vfmlslq_high_u32(a, b, c);			return vfmlslq_high_f16(a, b, c);
	}			}

	// Indexed form			// Indexed form

	float32x2_t test_vfmlal_lane_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) {			float32x2_t test_vfmlal_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
	// CHECK-LABEL: define <2 x float> @test_vfmlal_lane_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)			// CHECK-LABEL: define <2 x float> @test_vfmlal_lane_low_f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer			// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer
	// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])			// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
	// CHECK: ret <2 x float> [[RESULT]]			// CHECK: ret <2 x float> [[RESULT]]
	return vfmlal_lane_low_u32(a, b, c, 0);			return vfmlal_lane_low_f16(a, b, c, 0);
	}			}

	float32x2_t test_vfmlal_lane_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) {			float32x2_t test_vfmlal_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
	// CHECK-LABEL: define <2 x float> @test_vfmlal_lane_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)			// CHECK-LABEL: define <2 x float> @test_vfmlal_lane_high_f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>			// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
	// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])			// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
	// CHECK: ret <2 x float> [[RESULT]]			// CHECK: ret <2 x float> [[RESULT]]
	return vfmlal_lane_high_u32(a, b, c, 1);			return vfmlal_lane_high_f16(a, b, c, 1);
	}			}

	float32x4_t test_vfmlalq_lane_low_u32(float32x4_t a, float16x8_t b, float16x4_t c) {			float32x4_t test_vfmlalq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
	// CHECK-LABEL: define <4 x float> @test_vfmlalq_lane_low_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c)			// CHECK-LABEL: define <4 x float> @test_vfmlalq_lane_low_f16(<4 x float> %a, <8 x half> %b, <4 x half> %c)
	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>			// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
	// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])			// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
	// CHECK: ret <4 x float> [[RESULT]]			// CHECK: ret <4 x float> [[RESULT]]
	return vfmlalq_lane_low_u32(a, b, c, 2);			return vfmlalq_lane_low_f16(a, b, c, 2);
	}			}

	float32x4_t test_vfmlalq_lane_high_u32(float32x4_t a, float16x8_t b, float16x4_t c) {			float32x4_t test_vfmlalq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
	// CHECK-LABEL: define <4 x float> @test_vfmlalq_lane_high_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c)			// CHECK-LABEL: define <4 x float> @test_vfmlalq_lane_high_f16(<4 x float> %a, <8 x half> %b, <4 x half> %c)
	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>			// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
	// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])			// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
	// CHECK: ret <4 x float> [[RESULT]]			// CHECK: ret <4 x float> [[RESULT]]
	return vfmlalq_lane_high_u32(a, b, c, 3);			return vfmlalq_lane_high_f16(a, b, c, 3);
	}			}

	float32x2_t test_vfmlal_laneq_low_u32(float32x2_t a, float16x4_t b, float16x8_t c) {			float32x2_t test_vfmlal_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
	// CHECK-LABEL: define <2 x float> @test_vfmlal_laneq_low_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c)			// CHECK-LABEL: define <2 x float> @test_vfmlal_laneq_low_f16(<2 x float> %a, <4 x half> %b, <8 x half> %c)
	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 4, i32 4, i32 4, i32 4>			// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
	// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])			// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
	// CHECK: ret <2 x float> [[RESULT]]			// CHECK: ret <2 x float> [[RESULT]]
	return vfmlal_laneq_low_u32(a, b, c, 4);			return vfmlal_laneq_low_f16(a, b, c, 4);
	}			}

	float32x2_t test_vfmlal_laneq_high_u32(float32x2_t a, float16x4_t b, float16x8_t c) {			float32x2_t test_vfmlal_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
	// CHECK-LABEL: define <2 x float> @test_vfmlal_laneq_high_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c)			// CHECK-LABEL: define <2 x float> @test_vfmlal_laneq_high_f16(<2 x float> %a, <4 x half> %b, <8 x half> %c)
	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5>			// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
	// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])			// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
	// CHECK: ret <2 x float> [[RESULT]]			// CHECK: ret <2 x float> [[RESULT]]
	return vfmlal_laneq_high_u32(a, b, c, 5);			return vfmlal_laneq_high_f16(a, b, c, 5);
	}			}

	float32x4_t test_vfmlalq_laneq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) {			float32x4_t test_vfmlalq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
	// CHECK-LABEL: define <4 x float> @test_vfmlalq_laneq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)			// CHECK-LABEL: define <4 x float> @test_vfmlalq_laneq_low_f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>			// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
	// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])			// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
	// CHECK: ret <4 x float> [[RESULT]]			// CHECK: ret <4 x float> [[RESULT]]
	return vfmlalq_laneq_low_u32(a, b, c, 6);			return vfmlalq_laneq_low_f16(a, b, c, 6);
	}			}

	float32x4_t test_vfmlalq_laneq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) {			float32x4_t test_vfmlalq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
	// CHECK-LABEL: define <4 x float> @test_vfmlalq_laneq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)			// CHECK-LABEL: define <4 x float> @test_vfmlalq_laneq_high_f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>			// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
	// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])			// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
	// CHECK: ret <4 x float> [[RESULT]]			// CHECK: ret <4 x float> [[RESULT]]
	return vfmlalq_laneq_high_u32(a, b, c, 7);			return vfmlalq_laneq_high_f16(a, b, c, 7);
	}			}

	float32x2_t test_vfmlsl_lane_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) {			float32x2_t test_vfmlsl_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
	// CHECK-LABEL: define <2 x float> @test_vfmlsl_lane_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)			// CHECK-LABEL: define <2 x float> @test_vfmlsl_lane_low_f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer			// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer
	// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])			// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
	// CHECK: ret <2 x float> [[RESULT]]			// CHECK: ret <2 x float> [[RESULT]]
	return vfmlsl_lane_low_u32(a, b, c, 0);			return vfmlsl_lane_low_f16(a, b, c, 0);
	}			}

	float32x2_t test_vfmlsl_lane_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) {			float32x2_t test_vfmlsl_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
	// CHECK-LABEL: define <2 x float> @test_vfmlsl_lane_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)			// CHECK-LABEL: define <2 x float> @test_vfmlsl_lane_high_f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>			// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
	// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])			// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
	// CHECK: ret <2 x float> [[RESULT]]			// CHECK: ret <2 x float> [[RESULT]]
	return vfmlsl_lane_high_u32(a, b, c, 1);			return vfmlsl_lane_high_f16(a, b, c, 1);
	}			}

	float32x4_t test_vfmlslq_lane_low_u32(float32x4_t a, float16x8_t b, float16x4_t c) {			float32x4_t test_vfmlslq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
	// CHECK-LABEL: define <4 x float> @test_vfmlslq_lane_low_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c)			// CHECK-LABEL: define <4 x float> @test_vfmlslq_lane_low_f16(<4 x float> %a, <8 x half> %b, <4 x half> %c)
	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>			// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
	// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])			// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
	// CHECK: ret <4 x float> [[RESULT]]			// CHECK: ret <4 x float> [[RESULT]]
	return vfmlslq_lane_low_u32(a, b, c, 2);			return vfmlslq_lane_low_f16(a, b, c, 2);
	}			}

	float32x4_t test_vfmlslq_lane_high_u32(float32x4_t a, float16x8_t b, float16x4_t c) {			float32x4_t test_vfmlslq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
	// CHECK-LABEL: define <4 x float> @test_vfmlslq_lane_high_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c)			// CHECK-LABEL: define <4 x float> @test_vfmlslq_lane_high_f16(<4 x float> %a, <8 x half> %b, <4 x half> %c)
	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>			// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
	// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])			// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
	// CHECK: ret <4 x float> [[RESULT]]			// CHECK: ret <4 x float> [[RESULT]]
	return vfmlslq_lane_high_u32(a, b, c, 3);			return vfmlslq_lane_high_f16(a, b, c, 3);
	}			}

	float32x2_t test_vfmlsl_laneq_low_u32(float32x2_t a, float16x4_t b, float16x8_t c) {			float32x2_t test_vfmlsl_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
	// CHECK-LABEL: define <2 x float> @test_vfmlsl_laneq_low_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c)			// CHECK-LABEL: define <2 x float> @test_vfmlsl_laneq_low_f16(<2 x float> %a, <4 x half> %b, <8 x half> %c)
	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 4, i32 4, i32 4, i32 4>			// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
	// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])			// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
	// CHECK: ret <2 x float> [[RESULT]]			// CHECK: ret <2 x float> [[RESULT]]
	return vfmlsl_laneq_low_u32(a, b, c, 4);			return vfmlsl_laneq_low_f16(a, b, c, 4);
	}			}

	float32x2_t test_vfmlsl_laneq_high_u32(float32x2_t a, float16x4_t b, float16x8_t c) {			float32x2_t test_vfmlsl_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
	// CHECK-LABEL: define <2 x float> @test_vfmlsl_laneq_high_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c)			// CHECK-LABEL: define <2 x float> @test_vfmlsl_laneq_high_f16(<2 x float> %a, <4 x half> %b, <8 x half> %c)
	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5>			// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
	// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])			// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
	// CHECK: ret <2 x float> [[RESULT]]			// CHECK: ret <2 x float> [[RESULT]]
	return vfmlsl_laneq_high_u32(a, b, c, 5);			return vfmlsl_laneq_high_f16(a, b, c, 5);
	}			}

	float32x4_t test_vfmlslq_laneq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) {			float32x4_t test_vfmlslq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
	// CHECK-LABEL: define <4 x float> @test_vfmlslq_laneq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)			// CHECK-LABEL: define <4 x float> @test_vfmlslq_laneq_low_f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>			// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
	// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])			// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
	// CHECK: ret <4 x float> [[RESULT]]			// CHECK: ret <4 x float> [[RESULT]]
	return vfmlslq_laneq_low_u32(a, b, c, 6);			return vfmlslq_laneq_low_f16(a, b, c, 6);
	}			}

	float32x4_t test_vfmlslq_laneq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) {			float32x4_t test_vfmlslq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
	// CHECK-LABEL: define <4 x float> @test_vfmlslq_laneq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)			// CHECK-LABEL: define <4 x float> @test_vfmlslq_laneq_high_f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>			// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
	// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])			// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
	// CHECK: ret <4 x float> [[RESULT]]			// CHECK: ret <4 x float> [[RESULT]]
	return vfmlslq_laneq_high_u32(a, b, c, 7);			return vfmlslq_laneq_high_f16(a, b, c, 7);
	}			}