This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
clang/
-
include/clang/Basic/
-
clang/
-
Basic/
-
arm_mve.td
-
test/CodeGen/arm-mve-intrinsics/
-
CodeGen/
-
arm-mve-intrinsics/
-
vcvt.c
-
llvm/
-
include/llvm/IR/
-
llvm/
-
IR/
-
IntrinsicsARM.td
-
lib/Target/ARM/
-
Target/
-
ARM/
-
ARMInstrMVE.td
-
test/CodeGen/Thumb2/mve-intrinsics/
-
CodeGen/
-
Thumb2/
-
mve-intrinsics/
-
vcvt.ll

Differential D75254

[ARM,MVE] Add ACLE intrinsics for VCVT.F32.F16 family.
ClosedPublic

Authored by simon_tatham on Feb 27 2020, 6:57 AM.

Download Raw Diff

Details

Reviewers

MarkMurrayARM
dmgreen
miyuki
ostannard

Commits

rGb08d2ddd69b4: [ARM,MVE] Add ACLE intrinsics for VCVT.F32.F16 family.

Summary

These instructions make a vector of <4 x float> by widening every
other lane of a vector of <8 x half>.

I wondered about representing these using standard IR, along the lines
of a shufflevector to extract elements of the input into a <4 x half>
followed by an fpext to turn that into <4 x float>. But it looks as
if that would take a lot of work in isel lowering to make it match any
pattern I could sensibly write in Tablegen, and also I haven't been
able to think of any other case where that pattern might be generated
in IR, so there wouldn't be any extra code generation win from doing
it that way.

Therefore, I've just used another target-specific intrinsic. We can
always change it to the other way later if anyone thinks of a good
reason.

(In order to put the intrinsic definition near similar things in
IntrinsicsARM.td, I've also lifted the definition of the
MVEMXPredicated multiclass higher up the file, without changing it.)

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

simon_tatham created this revision. Feb 27 2020, 6:57 AM

Herald added projects: Restricted Project, Restricted Project. · View Herald Transcript · Feb 27 2020, 6:57 AM

Herald added subscribers: llvm-commits, cfe-commits, hiraditya, kristof.beyls. · View Herald Transcript

Harbormaster completed remote builds in B47419: Diff 246936. Feb 27 2020, 6:58 AM

simon_tatham added a parent revision: D75253: [ARM,MVE] Correct MC operands in VCVT.F32.F16. (NFC). Feb 27 2020, 6:59 AM

LGTM

This revision is now accepted and ready to land. Feb 28 2020, 2:11 AM

Closed by commit rGb08d2ddd69b4: [ARM,MVE] Add ACLE intrinsics for VCVT.F32.F16 family. (authored by simon_tatham). · Explain Why · Mar 2 2020, 2:35 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

clang/

include/

clang/

Basic/

arm_mve.td

9 lines

test/

CodeGen/

arm-mve-intrinsics/

vcvt.c

68 lines

llvm/

include/

llvm/

IR/

IntrinsicsARM.td

26 lines

lib/

Target/

ARM/

ARMInstrMVE.td

11 lines

test/

CodeGen/

Thumb2/

mve-intrinsics/

vcvt.ll

50 lines

Diff 247582

clang/include/clang/Basic/arm_mve.td

Show First 20 Lines • Show All 447 Lines • ▼ Show 20 Lines	foreach half = [ "b", "t" ] in {
let params = [f32], pnt = PNT_None in {		let params = [f32], pnt = PNT_None in {
def vcvt#half#q_f16: Intrinsic<		def vcvt#half#q_f16: Intrinsic<
VecOf<f16>, (args VecOf<f16>:$inactive, Vector:$a),		VecOf<f16>, (args VecOf<f16>:$inactive, Vector:$a),
(IRInt<"vcvt_narrow"> $inactive, $a, halfconst)>;		(IRInt<"vcvt_narrow"> $inactive, $a, halfconst)>;
def vcvt#half#q_m_f16: Intrinsic<		def vcvt#half#q_m_f16: Intrinsic<
VecOf<f16>, (args VecOf<f16>:$inactive, Vector:$a, PredOf<f32>:$pred),		VecOf<f16>, (args VecOf<f16>:$inactive, Vector:$a, PredOf<f32>:$pred),
(IRInt<"vcvt_narrow_predicated"> $inactive, $a, halfconst, $pred)>;		(IRInt<"vcvt_narrow_predicated"> $inactive, $a, halfconst, $pred)>;
} // params = [f32], pnt = PNT_None		} // params = [f32], pnt = PNT_None

		let params = [f16], pnt = PNT_None in {
		def vcvt#half#q_f32: Intrinsic<VecOf<f32>, (args Vector:$a),
		(IRInt<"vcvt_widen"> $a, halfconst)>;
		defm vcvt#half#q: IntrinsicMX<
		VecOf<f32>, (args Vector:$a, PredOf<f32>:$pred),
		(IRInt<"vcvt_widen_predicated"> $inactive, $a, halfconst, $pred),
		1, "_f32">;
		} // params = [f16], pnt = PNT_None
} // loop over half = "b", "t"		} // loop over half = "b", "t"

multiclass float_int_conversions<Type FScalar, Type IScalar, IRBuilderBase ftoi, IRBuilderBase itof> {		multiclass float_int_conversions<Type FScalar, Type IScalar, IRBuilderBase ftoi, IRBuilderBase itof> {
defvar FVector = VecOf<FScalar>;		defvar FVector = VecOf<FScalar>;
defvar IVector = VecOf<IScalar>;		defvar IVector = VecOf<IScalar>;

let params = [IScalar] in {		let params = [IScalar] in {
let pnt = PNT_2Type in {		let pnt = PNT_2Type in {
▲ Show 20 Lines • Show All 944 Lines • Show Last 20 Lines

clang/test/CodeGen/arm-mve-intrinsics/vcvt.c

	Show First 20 Lines • Show All 691 Lines • ▼ Show 20 Lines
	// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])			// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
	// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vcvt.fix.predicated.v4i32.v4f32.v4i1(i32 1, <4 x i32> undef, <4 x float> [[A:%.*]], i32 32, <4 x i1> [[TMP1]])			// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vcvt.fix.predicated.v4i32.v4f32.v4i1(i32 1, <4 x i32> undef, <4 x float> [[A:%.*]], i32 32, <4 x i1> [[TMP1]])
	// CHECK-NEXT: ret <4 x i32> [[TMP2]]			// CHECK-NEXT: ret <4 x i32> [[TMP2]]
	//			//
	uint32x4_t test_vcvtq_x_n_u32_f32(float32x4_t a, mve_pred16_t p)			uint32x4_t test_vcvtq_x_n_u32_f32(float32x4_t a, mve_pred16_t p)
	{			{
	return vcvtq_x_n_u32_f32(a, 32, p);			return vcvtq_x_n_u32_f32(a, 32, p);
	}			}

				// CHECK-LABEL: @test_vcvtbq_f32_f16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> [[A:%.*]], i32 0)
				// CHECK-NEXT: ret <4 x float> [[TMP0]]
				//
				float32x4_t test_vcvtbq_f32_f16(float16x8_t a)
				{
				return vcvtbq_f32_f16(a);
				}

				// CHECK-LABEL: @test_vcvttq_f32_f16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> [[A:%.*]], i32 1)
				// CHECK-NEXT: ret <4 x float> [[TMP0]]
				//
				float32x4_t test_vcvttq_f32_f16(float16x8_t a)
				{
				return vcvttq_f32_f16(a);
				}

				// CHECK-LABEL: @test_vcvtbq_m_f32_f16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
				// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> [[INACTIVE:%.*]], <8 x half> [[A:%.*]], i32 0, <4 x i1> [[TMP1]])
				// CHECK-NEXT: ret <4 x float> [[TMP2]]
				//
				float32x4_t test_vcvtbq_m_f32_f16(float32x4_t inactive, float16x8_t a, mve_pred16_t p)
				{
				return vcvtbq_m_f32_f16(inactive, a, p);
				}

				// CHECK-LABEL: @test_vcvttq_m_f32_f16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
				// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> [[INACTIVE:%.*]], <8 x half> [[A:%.*]], i32 1, <4 x i1> [[TMP1]])
				// CHECK-NEXT: ret <4 x float> [[TMP2]]
				//
				float32x4_t test_vcvttq_m_f32_f16(float32x4_t inactive, float16x8_t a, mve_pred16_t p)
				{
				return vcvttq_m_f32_f16(inactive, a, p);
				}

				// CHECK-LABEL: @test_vcvtbq_x_f32_f16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
				// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> undef, <8 x half> [[A:%.*]], i32 0, <4 x i1> [[TMP1]])
				// CHECK-NEXT: ret <4 x float> [[TMP2]]
				//
				float32x4_t test_vcvtbq_x_f32_f16(float16x8_t a, mve_pred16_t p)
				{
				return vcvtbq_x_f32_f16(a, p);
				}

				// CHECK-LABEL: @test_vcvttq_x_f32_f16(
				// CHECK-NEXT: entry:
				// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
				// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
				// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> undef, <8 x half> [[A:%.*]], i32 1, <4 x i1> [[TMP1]])
				// CHECK-NEXT: ret <4 x float> [[TMP2]]
				//
				float32x4_t test_vcvttq_x_f32_f16(float16x8_t a, mve_pred16_t p)
				{
				return vcvttq_x_f32_f16(a, p);
				}

llvm/include/llvm/IR/IntrinsicsARM.td

Show First 20 Lines • Show All 905 Lines • ▼ Show 20 Lines	multiclass MVEPredicatedM<list<LLVMType> rets, list<LLVMType> params,
LLVMType pred = llvm_anyvector_ty,		LLVMType pred = llvm_anyvector_ty,
list<IntrinsicProperty> props = [IntrNoMem]> {		list<IntrinsicProperty> props = [IntrNoMem]> {
def "": Intrinsic<rets, params, props>;		def "": Intrinsic<rets, params, props>;
def _predicated: Intrinsic<rets, params # [pred,		def _predicated: Intrinsic<rets, params # [pred,
!if(!eq(!cast<string>(rets[0]), "llvm_anyvector_ty"),		!if(!eq(!cast<string>(rets[0]), "llvm_anyvector_ty"),
LLVMMatchType<0>, rets[0])], props>;		LLVMMatchType<0>, rets[0])], props>;
}		}

		// Intrinsic with a predicated and a non-predicated case. The predicated case
		// has two additional parameters: inactive (the value for inactive lanes, can
		// be undef) and predicate.
		multiclass MVEMXPredicated<list<LLVMType> rets, list<LLVMType> flags,
		list<LLVMType> params, LLVMType inactive,
		LLVMType predicate,
		list<IntrinsicProperty> props = [IntrNoMem]> {
		def "": Intrinsic<rets, flags # params, props>;
		def _predicated: Intrinsic<rets, flags # [inactive] # params # [predicate],
		props>;
		}

defm int_arm_mve_vcvt_narrow: MVEPredicated<[llvm_v8f16_ty],		defm int_arm_mve_vcvt_narrow: MVEPredicated<[llvm_v8f16_ty],
[llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty], llvm_v4i1_ty>;		[llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty], llvm_v4i1_ty>;
		defm int_arm_mve_vcvt_widen: MVEMXPredicated<[llvm_v4f32_ty], [],
		[llvm_v8f16_ty, llvm_i32_ty], llvm_v4f32_ty, llvm_v4i1_ty>;

defm int_arm_mve_vldr_gather_base: MVEPredicated<		defm int_arm_mve_vldr_gather_base: MVEPredicated<
[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty],		[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty],
llvm_anyvector_ty, [IntrReadMem]>;		llvm_anyvector_ty, [IntrReadMem]>;
defm int_arm_mve_vldr_gather_base_wb: MVEPredicated<		defm int_arm_mve_vldr_gather_base_wb: MVEPredicated<
[llvm_anyvector_ty, llvm_anyvector_ty],		[llvm_anyvector_ty, llvm_anyvector_ty],
[LLVMMatchType<1>, llvm_i32_ty], llvm_anyvector_ty, [IntrReadMem]>;		[LLVMMatchType<1>, llvm_i32_ty], llvm_anyvector_ty, [IntrReadMem]>;
defm int_arm_mve_vstr_scatter_base: MVEPredicated<		defm int_arm_mve_vstr_scatter_base: MVEPredicated<
▲ Show 20 Lines • Show All 115 Lines • ▼ Show 20 Lines
def int_arm_mve_vmull: Intrinsic<		def int_arm_mve_vmull: Intrinsic<
[llvm_anyvector_ty],		[llvm_anyvector_ty],
[llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty /* unsigned */,		[llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty /* unsigned */,
llvm_i32_ty /* top */], [IntrNoMem]>;		llvm_i32_ty /* top */], [IntrNoMem]>;
def int_arm_mve_vmull_poly: Intrinsic<		def int_arm_mve_vmull_poly: Intrinsic<
[llvm_anyvector_ty],		[llvm_anyvector_ty],
[llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty], [IntrNoMem]>;		[llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty], [IntrNoMem]>;

// Intrinsic with a predicated and a non-predicated case. The predicated case
// has two additional parameters: inactive (the value for inactive lanes, can
// be undef) and predicate.
multiclass MVEMXPredicated<list<LLVMType> rets, list<LLVMType> flags,
list<LLVMType> params, LLVMType inactive,
LLVMType predicate,
list<IntrinsicProperty> props = [IntrNoMem]> {
def "": Intrinsic<rets, flags # params, props>;
def _predicated: Intrinsic<rets, flags # [inactive] # params # [predicate],
props>;
}

// The first two parameters are compile-time constants:		// The first two parameters are compile-time constants:
// * Halving: 0 means halving (vhcaddq), 1 means non-halving (vcaddq)		// * Halving: 0 means halving (vhcaddq), 1 means non-halving (vcaddq)
// instruction. Note: the flag is inverted to match the corresonding		// instruction. Note: the flag is inverted to match the corresonding
// bit in the instruction encoding		// bit in the instruction encoding
// * Rotation angle: 0 mean 90 deg, 1 means 180 deg		// * Rotation angle: 0 mean 90 deg, 1 means 180 deg
defm int_arm_mve_vcaddq : MVEMXPredicated<		defm int_arm_mve_vcaddq : MVEMXPredicated<
[llvm_anyvector_ty],		[llvm_anyvector_ty],
[llvm_i32_ty, llvm_i32_ty], [LLVMMatchType<0>, LLVMMatchType<0>],		[llvm_i32_ty, llvm_i32_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
▲ Show 20 Lines • Show All 157 Lines • Show Last 20 Lines

llvm/lib/Target/ARM/ARMInstrMVE.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 4,509 Lines • ▼ Show 20 Lines	def : Pat<(v8f16 (int_arm_mve_vcvt_narrow_predicated
(v4i1 VCCR:$mask))),		(v4i1 VCCR:$mask))),
(v8f16 (Inst (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm),		(v8f16 (Inst (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm),
ARMVCCThen, (v4i1 VCCR:$mask)))>;		ARMVCCThen, (v4i1 VCCR:$mask)))>;
}		}
}		}

multiclass MVE_VCVT_h2f_m<string iname, int half> {		multiclass MVE_VCVT_h2f_m<string iname, int half> {
def "": MVE_VCVT_ff<iname, "f32.f16", 0b1, half, (ins), vpred_r, "">;		def "": MVE_VCVT_ff<iname, "f32.f16", 0b1, half, (ins), vpred_r, "">;
		defvar Inst = !cast<Instruction>(NAME);

		let Predicates = [HasMVEFloat] in {
		def : Pat<(v4f32 (int_arm_mve_vcvt_widen (v8f16 MQPR:$Qm), (i32 half))),
		(v4f32 (Inst (v8f16 MQPR:$Qm)))>;
		def : Pat<(v4f32 (int_arm_mve_vcvt_widen_predicated
		(v4f32 MQPR:$inactive), (v8f16 MQPR:$Qm), (i32 half),
		(v4i1 VCCR:$mask))),
		(v4f32 (Inst (v8f16 MQPR:$Qm), ARMVCCThen,
		(v4i1 VCCR:$mask), (v4f32 MQPR:$inactive)))>;
		}
}		}

defm MVE_VCVTf16f32bh : MVE_VCVT_f2h_m<"vcvtb", 0b0>;		defm MVE_VCVTf16f32bh : MVE_VCVT_f2h_m<"vcvtb", 0b0>;
defm MVE_VCVTf16f32th : MVE_VCVT_f2h_m<"vcvtt", 0b1>;		defm MVE_VCVTf16f32th : MVE_VCVT_f2h_m<"vcvtt", 0b1>;
defm MVE_VCVTf32f16bh : MVE_VCVT_h2f_m<"vcvtb", 0b0>;		defm MVE_VCVTf32f16bh : MVE_VCVT_h2f_m<"vcvtb", 0b0>;
defm MVE_VCVTf32f16th : MVE_VCVT_h2f_m<"vcvtt", 0b1>;		defm MVE_VCVTf32f16th : MVE_VCVT_h2f_m<"vcvtt", 0b1>;

class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve,		class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve,
▲ Show 20 Lines • Show All 2,337 Lines • Show Last 20 Lines

llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s			; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s

	declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)			declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
	declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)			declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)

	declare <8 x half> @llvm.arm.mve.vcvt.narrow(<8 x half>, <4 x float>, i32)			declare <8 x half> @llvm.arm.mve.vcvt.narrow(<8 x half>, <4 x float>, i32)
	declare <8 x half> @llvm.arm.mve.vcvt.narrow.predicated(<8 x half>, <4 x float>, i32, <4 x i1>)			declare <8 x half> @llvm.arm.mve.vcvt.narrow.predicated(<8 x half>, <4 x float>, i32, <4 x i1>)
				declare <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half>, i32)
				declare <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float>, <8 x half>, i32, <4 x i1>)

	declare <8 x half> @llvm.arm.mve.vcvt.fix.v8f16.v8i16(i32, <8 x i16>, i32)			declare <8 x half> @llvm.arm.mve.vcvt.fix.v8f16.v8i16(i32, <8 x i16>, i32)
	declare <4 x float> @llvm.arm.mve.vcvt.fix.v4f32.v4i32(i32, <4 x i32>, i32)			declare <4 x float> @llvm.arm.mve.vcvt.fix.v4f32.v4i32(i32, <4 x i32>, i32)
	declare <8 x i16> @llvm.arm.mve.vcvt.fix.v8i16.v8f16(i32, <8 x half>, i32)			declare <8 x i16> @llvm.arm.mve.vcvt.fix.v8i16.v8f16(i32, <8 x half>, i32)
	declare <4 x i32> @llvm.arm.mve.vcvt.fix.v4i32.v4f32(i32, <4 x float>, i32)			declare <4 x i32> @llvm.arm.mve.vcvt.fix.v4i32.v4f32(i32, <4 x float>, i32)
	declare <8 x half> @llvm.arm.mve.vcvt.fix.predicated.v8f16.v8i16.v8i1(i32, <8 x half>, <8 x i16>, i32, <8 x i1>)			declare <8 x half> @llvm.arm.mve.vcvt.fix.predicated.v8f16.v8i16.v8i1(i32, <8 x half>, <8 x i16>, i32, <8 x i1>)
	declare <4 x float> @llvm.arm.mve.vcvt.fix.predicated.v4f32.v4i32.v4i1(i32, <4 x float>, <4 x i32>, i32, <4 x i1>)			declare <4 x float> @llvm.arm.mve.vcvt.fix.predicated.v4f32.v4i32.v4i1(i32, <4 x float>, <4 x i32>, i32, <4 x i1>)
	declare <8 x i16> @llvm.arm.mve.vcvt.fix.predicated.v8i16.v8f16.v8i1(i32, <8 x i16>, <8 x half>, i32, <8 x i1>)			declare <8 x i16> @llvm.arm.mve.vcvt.fix.predicated.v8i16.v8f16.v8i1(i32, <8 x i16>, <8 x half>, i32, <8 x i1>)
	▲ Show 20 Lines • Show All 345 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: vcvtt.u32.f32 q0, q0, #32			; CHECK-NEXT: vcvtt.u32.f32 q0, q0, #32
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%0 = zext i16 %p to i32			%0 = zext i16 %p to i32
	%1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)			%1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
	%2 = call <4 x i32> @llvm.arm.mve.vcvt.fix.predicated.v4i32.v4f32.v4i1(i32 1, <4 x i32> undef, <4 x float> %a, i32 32, <4 x i1> %1)			%2 = call <4 x i32> @llvm.arm.mve.vcvt.fix.predicated.v4i32.v4f32.v4i1(i32 1, <4 x i32> undef, <4 x float> %a, i32 32, <4 x i1> %1)
	ret <4 x i32> %2			ret <4 x i32> %2
	}			}

				define arm_aapcs_vfpcc <4 x float> @test_vcvtbq_f32_f16(<8 x half> %a) {
				; CHECK-LABEL: test_vcvtbq_f32_f16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vcvtb.f32.f16 q0, q0
				; CHECK-NEXT: bx lr
				entry:
				%0 = tail call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> %a, i32 0)
				ret <4 x float> %0
				}

				define arm_aapcs_vfpcc <4 x float> @test_vcvttq_f32_f16(<8 x half> %a) {
				; CHECK-LABEL: test_vcvttq_f32_f16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vcvtt.f32.f16 q0, q0
				; CHECK-NEXT: bx lr
				entry:
				%0 = tail call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> %a, i32 1)
				ret <4 x float> %0
				}

				define arm_aapcs_vfpcc <4 x float> @test_vcvtbq_m_f32_f16(<4 x float> %inactive, <8 x half> %a, i16 zeroext %p) {
				; CHECK-LABEL: test_vcvtbq_m_f32_f16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r0
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vcvtbt.f32.f16 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %p to i32
				%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
				%2 = tail call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> %inactive, <8 x half> %a, i32 0, <4 x i1> %1)
				ret <4 x float> %2
				}

				define arm_aapcs_vfpcc <4 x float> @test_vcvttq_m_f32_f16(<4 x float> %inactive, <8 x half> %a, i16 zeroext %p) {
				; CHECK-LABEL: test_vcvttq_m_f32_f16:
				; CHECK: @ %bb.0: @ %entry
				; CHECK-NEXT: vmsr p0, r0
				; CHECK-NEXT: vpst
				; CHECK-NEXT: vcvttt.f32.f16 q0, q1
				; CHECK-NEXT: bx lr
				entry:
				%0 = zext i16 %p to i32
				%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
				%2 = tail call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> %inactive, <8 x half> %a, i32 1, <4 x i1> %1)
				ret <4 x float> %2
				}