This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/
-
CodeGen/SelectionDAG/
-
SelectionDAG/
-
DAGCombiner.cpp
-
Target/AArch64/
-
AArch64/
1/8
AArch64ISelLowering.cpp
-
test/CodeGen/AArch64/
-
CodeGen/
-
AArch64/
-
sve-fixed-length-fp-extend-trunc.ll

Differential D114580

[AArch64][SVE] Mark fixed-type FP extending/truncating loads/stores as custom
ClosedPublic

Authored by bsmith on Nov 25 2021, 4:12 AM.

Download Raw Diff

Details

Reviewers

paulwalker-arm
peterwaller-arm
sdesmalen
efriedma

Commits

rG61808066325f: [AArch64][SVE] Mark fixed-type FP extending/truncating loads/stores as custom

Summary

This allows the generic DAG combine to fold fp_extend/fp_trunc into
loads/stores which we can then lower into an integer extending
load/truncating store plus an FP_EXTEND/FP_ROUND.

The nuance here is that fixed-type FP_EXTEND/FP_ROUND require unpacked
types hence lowering them introduces an unpack/zip. By allowing these
nodes to be combined with loads/stores we make it much easier to have
this unpack/zip combined into the load/store by our custom lowering.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

bsmith created this revision.Nov 25 2021, 4:12 AM

Herald added a reviewer: efriedma. · View Herald TranscriptNov 25 2021, 4:12 AM

Herald added subscribers: ecnelises, psnobl, hiraditya and 2 others. · View Herald Transcript

bsmith requested review of this revision.Nov 25 2021, 4:12 AM

Herald added a project: Restricted Project. · View Herald TranscriptNov 25 2021, 4:12 AM

Herald added a subscriber: llvm-commits. · View Herald Transcript

Harbormaster completed remote builds in B136026: Diff 389733.Nov 25 2021, 4:59 AM

paulwalker-arm added inline comments.Nov 25 2021, 5:42 AM

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
1563	This looks weird to me, shouldn't InnerVT be floating point? I guess the reason this works is because the i8 case is essentially bogus and you end up with the necessary floating point types from the second iteration. Either way I think this wants to be `MVT::f16`.
18680	Purely to help with formatting I believe you can just do `VT.isFloatingPoint()` here.
18681	Can you be specific here as in use `== ISD::EXTLOAD` as we shouldn't need to support any other extension types.
18688	I think it'll be safer to mirror the other if condition as the two are linked (i.e. we need to do both or none based on the same requirement).

bsmith added inline comments.Nov 25 2021, 6:00 AM

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
1563	I meant to change this but clearly forgot!

Fix incorrect type used in addTypeForFixedLengthSVE
Cleanup conditions in LowerFixedLengthVectorLoadToSVE

Harbormaster completed remote builds in B136051: Diff 389763.Nov 25 2021, 7:03 AM

Matt added a subscriber: Matt.Nov 25 2021, 8:53 AM

paulwalker-arm accepted this revision.Nov 26 2021, 3:48 AM

paulwalker-arm added inline comments.

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
18673–18674	You could move this into the if block where it's used.
18770–18771	Perhaps `TruncVT`? Plus as above this could be moved into the if block where it is used.
18775	As above this can be just `VT`.

This revision is now accepted and ready to land.Nov 26 2021, 3:48 AM

bsmith added a child revision: D114628: [AArch64][SVE] Duplicate FP_EXTEND/FP_TRUNC -> LOAD/STORE dag combines.Nov 26 2021, 4:07 AM

bsmith mentioned this in D110237: [AArch64][SVE] Add DAG combines to improve SVE fixed type FP_EXTEND lowering.Nov 26 2021, 4:10 AM

bsmith mentioned this in D110531: [AArch64][SVE] Perform FP_EXTEND combine on legal types to fold extend into load.

paulwalker-arm mentioned this in D108115: [DAG][sve] Lowering for VLS masked truncating stores.Nov 26 2021, 9:50 AM

Closed by commit rG61808066325f: [AArch64][SVE] Mark fixed-type FP extending/truncating loads/stores as custom (authored by bsmith). · Explain WhyNov 29 2021, 3:57 AM

This revision was automatically updated to reflect the committed changes.

bsmith added a commit: rG61808066325f: [AArch64][SVE] Mark fixed-type FP extending/truncating loads/stores as custom.

paulwalker-arm mentioned this in D114628: [AArch64][SVE] Duplicate FP_EXTEND/FP_TRUNC -> LOAD/STORE dag combines.Nov 30 2021, 8:52 AM

Revision Contents

Path

Size

llvm/

lib/

CodeGen/

SelectionDAG/

DAGCombiner.cpp

2 lines

Target/

AArch64/

AArch64ISelLowering.cpp

62 lines

test/

CodeGen/

AArch64/

sve-fixed-length-fp-extend-trunc.ll

100 lines

Diff 390301

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 15,001 Lines • ▼ Show 20 Lines	if (N0.getOpcode() == ISD::FP_ROUND
if (VT.bitsLT(In.getValueType()))		if (VT.bitsLT(In.getValueType()))
return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT,		return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT,
In, N0.getOperand(1));		In, N0.getOperand(1));
return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In);		return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In);
}		}

// fold (fpext (load x)) -> (fpext (fptrunc (extload x)))		// fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&		if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {		TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);		LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,		SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
LN0->getChain(),		LN0->getChain(),
LN0->getBasePtr(), N0.getValueType(),		LN0->getBasePtr(), N0.getValueType(),
LN0->getMemOperand());		LN0->getMemOperand());
CombineTo(N, ExtLoad);		CombineTo(N, ExtLoad);
CombineTo(N0.getNode(),		CombineTo(N0.getNode(),
DAG.getNode(ISD::FP_ROUND, SDLoc(N0),		DAG.getNode(ISD::FP_ROUND, SDLoc(N0),
▲ Show 20 Lines • Show All 8,744 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,539 Lines • ▼ Show 20 Lines	if (VT.isFloatingPoint()) {
setCondCodeAction(ISD::SETULT, VT, Expand);		setCondCodeAction(ISD::SETULT, VT, Expand);
setCondCodeAction(ISD::SETULE, VT, Expand);		setCondCodeAction(ISD::SETULE, VT, Expand);
setCondCodeAction(ISD::SETUGE, VT, Expand);		setCondCodeAction(ISD::SETUGE, VT, Expand);
setCondCodeAction(ISD::SETUGT, VT, Expand);		setCondCodeAction(ISD::SETUGT, VT, Expand);
setCondCodeAction(ISD::SETUEQ, VT, Expand);		setCondCodeAction(ISD::SETUEQ, VT, Expand);
setCondCodeAction(ISD::SETUNE, VT, Expand);		setCondCodeAction(ISD::SETUNE, VT, Expand);
}		}

// Mark integer truncating stores as having custom lowering		// Mark integer truncating stores/extending loads as having custom lowering
if (VT.isInteger()) {		if (VT.isInteger()) {
MVT InnerVT = VT.changeVectorElementType(MVT::i8);		MVT InnerVT = VT.changeVectorElementType(MVT::i8);
while (InnerVT != VT) {		while (InnerVT != VT) {
setTruncStoreAction(VT, InnerVT, Custom);		setTruncStoreAction(VT, InnerVT, Custom);
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);		setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);		setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
InnerVT = InnerVT.changeVectorElementType(		InnerVT = InnerVT.changeVectorElementType(
MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));		MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
}		}
}		}

		// Mark floating-point truncating stores/extending loads as having custom
		// lowering
		if (VT.isFloatingPoint()) {
		MVT InnerVT = VT.changeVectorElementType(MVT::f16);
		paulwalker-armUnsubmitted Not Done Reply Inline Actions This looks weird to me, shouldn't InnerVT be floating point? I guess the reason this works is because the i8 case is essentially bogus and you end up with the necessary floating point types from the second iteration. Either way I think this wants to be `MVT::f16`. paulwalker-arm: This looks weird to me, shouldn't InnerVT be floating point? I guess the reason this works is…
		bsmithAuthorUnsubmitted Done Reply Inline Actions I meant to change this but clearly forgot! bsmith: I meant to change this but clearly forgot!
		while (InnerVT != VT) {
		setTruncStoreAction(VT, InnerVT, Custom);
		setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
		InnerVT = InnerVT.changeVectorElementType(
		MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
		}
		}

// Lower fixed length vector operations to scalable equivalents.		// Lower fixed length vector operations to scalable equivalents.
setOperationAction(ISD::ABS, VT, Custom);		setOperationAction(ISD::ABS, VT, Custom);
setOperationAction(ISD::ADD, VT, Custom);		setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::AND, VT, Custom);		setOperationAction(ISD::AND, VT, Custom);
setOperationAction(ISD::ANY_EXTEND, VT, Custom);		setOperationAction(ISD::ANY_EXTEND, VT, Custom);
setOperationAction(ISD::BITCAST, VT, Custom);		setOperationAction(ISD::BITCAST, VT, Custom);
setOperationAction(ISD::BITREVERSE, VT, Custom);		setOperationAction(ISD::BITREVERSE, VT, Custom);
setOperationAction(ISD::BSWAP, VT, Custom);		setOperationAction(ISD::BSWAP, VT, Custom);
▲ Show 20 Lines • Show All 17,085 Lines • ▼ Show 20 Lines
// Convert all fixed length vector loads larger than NEON to masked_loads.		// Convert all fixed length vector loads larger than NEON to masked_loads.
SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(		SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
SDValue Op, SelectionDAG &DAG) const {		SDValue Op, SelectionDAG &DAG) const {
auto Load = cast<LoadSDNode>(Op);		auto Load = cast<LoadSDNode>(Op);

SDLoc DL(Op);		SDLoc DL(Op);
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);		EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
		EVT LoadVT = ContainerVT;
		EVT MemVT = Load->getMemoryVT();
		paulwalker-armUnsubmitted Not Done Reply Inline Actions You could move this into the if block where it's used. paulwalker-arm: You could move this into the if block where it's used.

		auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);

		if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
		LoadVT = ContainerVT.changeTypeToInteger();
		MemVT = MemVT.changeTypeToInteger();
		paulwalker-armUnsubmitted Not Done Reply Inline Actions Purely to help with formatting I believe you can just do `VT.isFloatingPoint()` here. paulwalker-arm: Purely to help with formatting I believe you can just do `VT.isFloatingPoint()` here.
		}
		paulwalker-armUnsubmitted Not Done Reply Inline Actions Can you be specific here as in use `== ISD::EXTLOAD` as we should need to support any other extension types. paulwalker-arm: Can you be specific here as in use `== ISD::EXTLOAD` as we should need to support any other…

auto NewLoad = DAG.getMaskedLoad(		auto NewLoad = DAG.getMaskedLoad(
ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),		LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),		DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),		Load->getAddressingMode(), Load->getExtensionType());
Load->getExtensionType());
		if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
		paulwalker-armUnsubmitted Not Done Reply Inline Actions I think it'll be safer to mirror the other if condition as the two are linked (i.e. we need to do both or none based on the same requirement). paulwalker-arm: I think it'll be safer to mirror the other if condition as the two are linked (i.e. we need to…
		EVT ExtendVT = ContainerVT.changeVectorElementType(
		Load->getMemoryVT().getVectorElementType());

		NewLoad = getSVESafeBitCast(ExtendVT, NewLoad, DAG);
		NewLoad = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
		Pg, NewLoad, DAG.getUNDEF(ContainerVT));
		}

auto Result = convertFromScalableVector(DAG, VT, NewLoad);		auto Result = convertFromScalableVector(DAG, VT, NewLoad);
SDValue MergedValues[2] = {Result, Load->getChain()};		SDValue MergedValues[2] = {Result, Load->getChain()};
return DAG.getMergeValues(MergedValues, DL);		return DAG.getMergeValues(MergedValues, DL);
}		}

static SDValue convertFixedMaskToScalableVector(SDValue Mask,		static SDValue convertFixedMaskToScalableVector(SDValue Mask,
SelectionDAG &DAG) {		SelectionDAG &DAG) {
▲ Show 20 Lines • Show All 58 Lines • ▼ Show 20 Lines
// Convert all fixed length vector stores larger than NEON to masked_stores.		// Convert all fixed length vector stores larger than NEON to masked_stores.
SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(		SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
SDValue Op, SelectionDAG &DAG) const {		SDValue Op, SelectionDAG &DAG) const {
auto Store = cast<StoreSDNode>(Op);		auto Store = cast<StoreSDNode>(Op);

SDLoc DL(Op);		SDLoc DL(Op);
EVT VT = Store->getValue().getValueType();		EVT VT = Store->getValue().getValueType();
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);		EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
		EVT MemVT = Store->getMemoryVT();

		paulwalker-armUnsubmitted Not Done Reply Inline Actions Perhaps `TruncVT`? Plus as above this could be moved into the if block where it is used. paulwalker-arm: Perhaps `TruncVT`? Plus as above this could be moved into the if block where it is used.
		auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());		auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
return DAG.getMaskedStore(
Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),		if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
		paulwalker-armUnsubmitted Not Done Reply Inline Actions As above this can be just `VT`. paulwalker-arm: As above this can be just `VT`.
getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),		EVT TruncVT = ContainerVT.changeVectorElementType(
		Store->getMemoryVT().getVectorElementType());
		MemVT = MemVT.changeTypeToInteger();
		NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
		NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
		DAG.getUNDEF(TruncVT));
		NewValue =
		getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
		}

		return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
		Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
Store->getMemOperand(), Store->getAddressingMode(),		Store->getMemOperand(), Store->getAddressingMode(),
Store->isTruncatingStore());		Store->isTruncatingStore());
}		}

SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(		SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
SDValue Op, SelectionDAG &DAG) const {		SDValue Op, SelectionDAG &DAG) const {
auto Store = cast<MaskedStoreSDNode>(Op);		auto Store = cast<MaskedStoreSDNode>(Op);

if (Store->isTruncatingStore())		if (Store->isTruncatingStore())
return SDValue();		return SDValue();
▲ Show 20 Lines • Show All 734 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll

Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%res = fpext <4 x half> %op1 to <4 x float>		%res = fpext <4 x half> %op1 to <4 x float>
ret <4 x float> %res		ret <4 x float> %res
}		}

define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) #0 {		define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) #0 {
; CHECK-LABEL: fcvt_v8f16_v8f32:		; CHECK-LABEL: fcvt_v8f16_v8f32:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ptrue p0.s, vl8		; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: uunpklo z0.s, z0.h		; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.s, p0/m, z0.h		; CHECK-NEXT: fcvt z0.s, p0/m, z0.h
; CHECK-NEXT: st1w { z0.s }, p0, [x1]		; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%op1 = load <8 x half>, <8 x half>* %a		%op1 = load <8 x half>, <8 x half>* %a
%res = fpext <8 x half> %op1 to <8 x float>		%res = fpext <8 x half> %op1 to <8 x float>
store <8 x float> %res, <8 x float>* %b		store <8 x float> %res, <8 x float>* %b
ret void		ret void
}		}
Show All 9 Lines
; VBITS_EQ_256-NEXT: uunpklo z1.s, z0.h		; VBITS_EQ_256-NEXT: uunpklo z1.s, z0.h
; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16		; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h		; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h
; VBITS_EQ_256-NEXT: fcvt z1.s, p0/m, z1.h		; VBITS_EQ_256-NEXT: fcvt z1.s, p0/m, z1.h
; VBITS_EQ_256-NEXT: fcvt z0.s, p0/m, z0.h		; VBITS_EQ_256-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1]		; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]		; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_EQ_256-NEXT: ret		; VBITS_EQ_256-NEXT: ret
		;
; VBITS_GE_512-LABEL: fcvt_v16f16_v16f32:		; VBITS_GE_512-LABEL: fcvt_v16f16_v16f32:
; VBITS_GE_512: // %bb.0:		; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl16
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.s, vl16		; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h		; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: fcvt z0.s, p0/m, z0.h		; VBITS_GE_512-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]		; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret		; VBITS_GE_512-NEXT: ret

%op1 = load <16 x half>, <16 x half>* %a		%op1 = load <16 x half>, <16 x half>* %a
%res = fpext <16 x half> %op1 to <16 x float>		%res = fpext <16 x half> %op1 to <16 x float>
store <16 x float> %res, <16 x float>* %b		store <16 x float> %res, <16 x float>* %b
ret void		ret void
}		}

define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) #0 {		define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) #0 {
; VBITS_GE_1024-LABEL: fcvt_v32f16_v32f32:		; VBITS_GE_1024-LABEL: fcvt_v32f16_v32f32:
; VBITS_GE_1024: // %bb.0:		; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32		; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h		; VBITS_GE_1024-NEXT: ld1sh { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.s, p0/m, z0.h		; VBITS_GE_1024-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]		; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret		; VBITS_GE_1024-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a		%op1 = load <32 x half>, <32 x half>* %a
%res = fpext <32 x half> %op1 to <32 x float>		%res = fpext <32 x half> %op1 to <32 x float>
store <32 x float> %res, <32 x float>* %b		store <32 x float> %res, <32 x float>* %b
ret void		ret void
}		}

define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) #0 {		define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) #0 {
; VBITS_GE_2048-LABEL: fcvt_v64f16_v64f32:		; VBITS_GE_2048-LABEL: fcvt_v64f16_v64f32:
; VBITS_GE_2048: // %bb.0:		; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64		; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h		; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.s, p0/m, z0.h		; VBITS_GE_2048-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]		; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret		; VBITS_GE_2048-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a		%op1 = load <64 x half>, <64 x half>* %a
%res = fpext <64 x half> %op1 to <64 x float>		%res = fpext <64 x half> %op1 to <64 x float>
store <64 x float> %res, <64 x float>* %b		store <64 x float> %res, <64 x float>* %b
ret void		ret void
}		}
Show All 25 Lines
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%res = fpext <2 x half> %op1 to <2 x double>		%res = fpext <2 x half> %op1 to <2 x double>
ret <2 x double> %res		ret <2 x double> %res
}		}

define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) #0 {		define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) #0 {
; CHECK-LABEL: fcvt_v4f16_v4f64:		; CHECK-LABEL: fcvt_v4f16_v4f64:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ptrue p0.d, vl4		; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: uunpklo z0.s, z0.h		; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0]
; CHECK-NEXT: uunpklo z0.d, z0.s
; CHECK-NEXT: fcvt z0.d, p0/m, z0.h		; CHECK-NEXT: fcvt z0.d, p0/m, z0.h
; CHECK-NEXT: st1d { z0.d }, p0, [x1]		; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%op1 = load <4 x half>, <4 x half>* %a		%op1 = load <4 x half>, <4 x half>* %a
%res = fpext <4 x half> %op1 to <4 x double>		%res = fpext <4 x half> %op1 to <4 x double>
store <4 x double> %res, <4 x double>* %b		store <4 x double> %res, <4 x double>* %b
ret void		ret void
}		}

define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 {		define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 {
; Ensure sensible type legalisation.
; VBITS_EQ_256-LABEL: fcvt_v8f16_v8f64:		; VBITS_EQ_256-LABEL: fcvt_v8f16_v8f64:
; VBITS_EQ_256: // %bb.0:		; VBITS_EQ_256: // %bb.0:
; VBITS_EQ_256-NEXT: ldr q0, [x0]		; VBITS_EQ_256-NEXT: ldr q0, [x0]
; VBITS_EQ_256-NEXT: mov x8, #4		; VBITS_EQ_256-NEXT: mov x8, #4
; VBITS_EQ_256-NEXT: ptrue p0.d, vl4		; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
; VBITS_EQ_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8		; VBITS_EQ_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h		; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h
; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s		; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
; VBITS_EQ_256-NEXT: fcvt z0.d, p0/m, z0.h		; VBITS_EQ_256-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1]		; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h		; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h
; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s		; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s
; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.h		; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.h
; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]		; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
; VBITS_EQ_256-NEXT: ret		; VBITS_EQ_256-NEXT: ret
		;
; VBITS_GE_512-LABEL: fcvt_v8f16_v8f64:		; VBITS_GE_512-LABEL: fcvt_v8f16_v8f64:
; VBITS_GE_512: // %bb.0:		; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ldr q0, [x0]
; VBITS_GE_512-NEXT: ptrue p0.d, vl8		; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h		; VBITS_GE_512-NEXT: ld1sh { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_512-NEXT: fcvt z0.d, p0/m, z0.h		; VBITS_GE_512-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]		; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret		; VBITS_GE_512-NEXT: ret

%op1 = load <8 x half>, <8 x half>* %a		%op1 = load <8 x half>, <8 x half>* %a
%res = fpext <8 x half> %op1 to <8 x double>		%res = fpext <8 x half> %op1 to <8 x double>
store <8 x double> %res, <8 x double>* %b		store <8 x double> %res, <8 x double>* %b
ret void		ret void
}		}

define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) #0 {		define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) #0 {
; VBITS_GE_1024-LABEL: fcvt_v16f16_v16f64:		; VBITS_GE_1024-LABEL: fcvt_v16f16_v16f64:
; VBITS_GE_1024: // %bb.0:		; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16		; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h		; VBITS_GE_1024-NEXT: ld1sh { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_1024-NEXT: fcvt z0.d, p0/m, z0.h		; VBITS_GE_1024-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]		; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret		; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a		%op1 = load <16 x half>, <16 x half>* %a
%res = fpext <16 x half> %op1 to <16 x double>		%res = fpext <16 x half> %op1 to <16 x double>
store <16 x double> %res, <16 x double>* %b		store <16 x double> %res, <16 x double>* %b
ret void		ret void
}		}

define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) #0 {		define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) #0 {
; VBITS_GE_2048-LABEL: fcvt_v32f16_v32f64:		; VBITS_GE_2048-LABEL: fcvt_v32f16_v32f64:
; VBITS_GE_2048: // %bb.0:		; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32		; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h		; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_2048-NEXT: fcvt z0.d, p0/m, z0.h		; VBITS_GE_2048-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]		; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret		; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a		%op1 = load <32 x half>, <32 x half>* %a
%res = fpext <32 x half> %op1 to <32 x double>		%res = fpext <32 x half> %op1 to <32 x double>
store <32 x double> %res, <32 x double>* %b		store <32 x double> %res, <32 x double>* %b
ret void		ret void
}		}
Show All 21 Lines
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%res = fpext <2 x float> %op1 to <2 x double>		%res = fpext <2 x float> %op1 to <2 x double>
ret <2 x double> %res		ret <2 x double> %res
}		}

define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) #0 {		define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) #0 {
; CHECK-LABEL: fcvt_v4f32_v4f64:		; CHECK-LABEL: fcvt_v4f32_v4f64:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ptrue p0.d, vl4		; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: uunpklo z0.d, z0.s		; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.d, p0/m, z0.s		; CHECK-NEXT: fcvt z0.d, p0/m, z0.s
; CHECK-NEXT: st1d { z0.d }, p0, [x1]		; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%op1 = load <4 x float>, <4 x float>* %a		%op1 = load <4 x float>, <4 x float>* %a
%res = fpext <4 x float> %op1 to <4 x double>		%res = fpext <4 x float> %op1 to <4 x double>
store <4 x double> %res, <4 x double>* %b		store <4 x double> %res, <4 x double>* %b
ret void		ret void
}		}
Show All 9 Lines
; VBITS_EQ_256-NEXT: uunpklo z1.d, z0.s		; VBITS_EQ_256-NEXT: uunpklo z1.d, z0.s
; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16		; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s		; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.s		; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.s
; VBITS_EQ_256-NEXT: fcvt z0.d, p0/m, z0.s		; VBITS_EQ_256-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1]		; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]		; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_EQ_256-NEXT: ret		; VBITS_EQ_256-NEXT: ret
		;
; VBITS_GE_512-LABEL: fcvt_v8f32_v8f64:		; VBITS_GE_512-LABEL: fcvt_v8f32_v8f64:
; VBITS_GE_512: // %bb.0:		; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.d, vl8		; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s		; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: fcvt z0.d, p0/m, z0.s		; VBITS_GE_512-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]		; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret		; VBITS_GE_512-NEXT: ret

%op1 = load <8 x float>, <8 x float>* %a		%op1 = load <8 x float>, <8 x float>* %a
%res = fpext <8 x float> %op1 to <8 x double>		%res = fpext <8 x float> %op1 to <8 x double>
store <8 x double> %res, <8 x double>* %b		store <8 x double> %res, <8 x double>* %b
ret void		ret void
}		}

define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) #0 {		define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) #0 {
; VBITS_GE_1024-LABEL: fcvt_v16f32_v16f64:		; VBITS_GE_1024-LABEL: fcvt_v16f32_v16f64:
; VBITS_GE_1024: // %bb.0:		; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16		; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s		; VBITS_GE_1024-NEXT: ld1sw { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.d, p0/m, z0.s		; VBITS_GE_1024-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]		; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret		; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a		%op1 = load <16 x float>, <16 x float>* %a
%res = fpext <16 x float> %op1 to <16 x double>		%res = fpext <16 x float> %op1 to <16 x double>
store <16 x double> %res, <16 x double>* %b		store <16 x double> %res, <16 x double>* %b
ret void		ret void
}		}

define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) #0 {		define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) #0 {
; VBITS_GE_2048-LABEL: fcvt_v32f32_v32f64:		; VBITS_GE_2048-LABEL: fcvt_v32f32_v32f64:
; VBITS_GE_2048: // %bb.0:		; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32		; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s		; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.d, p0/m, z0.s		; VBITS_GE_2048-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]		; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret		; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a		%op1 = load <32 x float>, <32 x float>* %a
%res = fpext <32 x float> %op1 to <32 x double>		%res = fpext <32 x float> %op1 to <32 x double>
store <32 x double> %res, <32 x double>* %b		store <32 x double> %res, <32 x double>* %b
ret void		ret void
}		}
▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines
; VBITS_EQ_256-NEXT: ptrue p0.h, vl16		; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1]		; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1]
; VBITS_EQ_256-NEXT: ret		; VBITS_EQ_256-NEXT: ret
;		;
; VBITS_GE_512-LABEL: fcvt_v16f32_v16f16:		; VBITS_GE_512-LABEL: fcvt_v16f32_v16f16:
; VBITS_GE_512: // %bb.0:		; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16		; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]		; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.s
; VBITS_GE_512-NEXT: fcvt z0.h, p0/m, z0.s		; VBITS_GE_512-NEXT: fcvt z0.h, p0/m, z0.s
; VBITS_GE_512-NEXT: ptrue p0.h, vl16		; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT: ret		; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a		%op1 = load <16 x float>, <16 x float>* %a
%res = fptrunc <16 x float> %op1 to <16 x half>		%res = fptrunc <16 x float> %op1 to <16 x half>
store <16 x half> %res, <16 x half>* %b		store <16 x half> %res, <16 x half>* %b
ret void		ret void
}		}

define void @fcvt_v32f32_v32f16(<32 x float>* %a, <32 x half>* %b) #0 {		define void @fcvt_v32f32_v32f16(<32 x float>* %a, <32 x half>* %b) #0 {
; VBITS_GE_1024-LABEL: fcvt_v32f32_v32f16:		; VBITS_GE_1024-LABEL: fcvt_v32f32_v32f16:
; VBITS_GE_1024: // %bb.0:		; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32		; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]		; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.s
; VBITS_GE_1024-NEXT: fcvt z0.h, p0/m, z0.s		; VBITS_GE_1024-NEXT: fcvt z0.h, p0/m, z0.s
; VBITS_GE_1024-NEXT: ptrue p0.h, vl32		; VBITS_GE_1024-NEXT: st1h { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_1024-NEXT: ret		; VBITS_GE_1024-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a		%op1 = load <32 x float>, <32 x float>* %a
%res = fptrunc <32 x float> %op1 to <32 x half>		%res = fptrunc <32 x float> %op1 to <32 x half>
store <32 x half> %res, <32 x half>* %b		store <32 x half> %res, <32 x half>* %b
ret void		ret void
}		}

define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) #0 {		define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) #0 {
; VBITS_GE_2048-LABEL: fcvt_v64f32_v64f16:		; VBITS_GE_2048-LABEL: fcvt_v64f32_v64f16:
; VBITS_GE_2048: // %bb.0:		; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64		; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]		; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.s
; VBITS_GE_2048-NEXT: fcvt z0.h, p0/m, z0.s		; VBITS_GE_2048-NEXT: fcvt z0.h, p0/m, z0.s
; VBITS_GE_2048-NEXT: ptrue p0.h, vl64		; VBITS_GE_2048-NEXT: st1h { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_2048-NEXT: ret		; VBITS_GE_2048-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a		%op1 = load <64 x float>, <64 x float>* %a
%res = fptrunc <64 x float> %op1 to <64 x half>		%res = fptrunc <64 x float> %op1 to <64 x half>
store <64 x half> %res, <64 x half>* %b		store <64 x half> %res, <64 x half>* %b
ret void		ret void
}		}

;		;
▲ Show 20 Lines • Show All 75 Lines • ▼ Show 20 Lines	; VBITS_GE_512-NEXT: ret
ret <8 x half> %res		ret <8 x half> %res
}		}

define void @fcvt_v16f64_v16f16(<16 x double>* %a, <16 x half>* %b) #0 {		define void @fcvt_v16f64_v16f16(<16 x double>* %a, <16 x half>* %b) #0 {
; VBITS_GE_1024-LABEL: fcvt_v16f64_v16f16:		; VBITS_GE_1024-LABEL: fcvt_v16f64_v16f16:
; VBITS_GE_1024: // %bb.0:		; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16		; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]		; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d
; VBITS_GE_1024-NEXT: fcvt z0.h, p0/m, z0.d		; VBITS_GE_1024-NEXT: fcvt z0.h, p0/m, z0.d
; VBITS_GE_1024-NEXT: ptrue p0.h, vl16		; VBITS_GE_1024-NEXT: st1h { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_1024-NEXT: ret		; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a		%op1 = load <16 x double>, <16 x double>* %a
%res = fptrunc <16 x double> %op1 to <16 x half>		%res = fptrunc <16 x double> %op1 to <16 x half>
store <16 x half> %res, <16 x half>* %b		store <16 x half> %res, <16 x half>* %b
ret void		ret void
}		}

define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) #0 {		define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) #0 {
; VBITS_GE_2048-LABEL: fcvt_v32f64_v32f16:		; VBITS_GE_2048-LABEL: fcvt_v32f64_v32f16:
; VBITS_GE_2048: // %bb.0:		; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32		; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]		; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d
; VBITS_GE_2048-NEXT: fcvt z0.h, p0/m, z0.d		; VBITS_GE_2048-NEXT: fcvt z0.h, p0/m, z0.d
; VBITS_GE_2048-NEXT: ptrue p0.h, vl32		; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_2048-NEXT: ret		; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a		%op1 = load <32 x double>, <32 x double>* %a
%res = fptrunc <32 x double> %op1 to <32 x half>		%res = fptrunc <32 x double> %op1 to <32 x half>
store <32 x half> %res, <32 x half>* %b		store <32 x half> %res, <32 x half>* %b
ret void		ret void
}		}

;		;
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines
; VBITS_EQ_256-NEXT: ptrue p0.s, vl8		; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1]		; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_EQ_256-NEXT: ret		; VBITS_EQ_256-NEXT: ret
;		;
; VBITS_GE_512-LABEL: fcvt_v8f64_v8f32:		; VBITS_GE_512-LABEL: fcvt_v8f64_v8f32:
; VBITS_GE_512: // %bb.0:		; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8		; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]		; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.d
; VBITS_GE_512-NEXT: fcvt z0.s, p0/m, z0.d		; VBITS_GE_512-NEXT: fcvt z0.s, p0/m, z0.d
; VBITS_GE_512-NEXT: ptrue p0.s, vl8		; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret		; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a		%op1 = load <8 x double>, <8 x double>* %a
%res = fptrunc <8 x double> %op1 to <8 x float>		%res = fptrunc <8 x double> %op1 to <8 x float>
store <8 x float> %res, <8 x float>* %b		store <8 x float> %res, <8 x float>* %b
ret void		ret void
}		}

define void @fcvt_v16f64_v16f32(<16 x double>* %a, <16 x float>* %b) #0 {		define void @fcvt_v16f64_v16f32(<16 x double>* %a, <16 x float>* %b) #0 {
; VBITS_GE_1024-LABEL: fcvt_v16f64_v16f32:		; VBITS_GE_1024-LABEL: fcvt_v16f64_v16f32:
; VBITS_GE_1024: // %bb.0:		; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16		; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]		; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d
; VBITS_GE_1024-NEXT: fcvt z0.s, p0/m, z0.d		; VBITS_GE_1024-NEXT: fcvt z0.s, p0/m, z0.d
; VBITS_GE_1024-NEXT: ptrue p0.s, vl16		; VBITS_GE_1024-NEXT: st1w { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret		; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a		%op1 = load <16 x double>, <16 x double>* %a
%res = fptrunc <16 x double> %op1 to <16 x float>		%res = fptrunc <16 x double> %op1 to <16 x float>
store <16 x float> %res, <16 x float>* %b		store <16 x float> %res, <16 x float>* %b
ret void		ret void
}		}

define void @fcvt_v32f64_v32f32(<32 x double>* %a, <32 x float>* %b) #0 {		define void @fcvt_v32f64_v32f32(<32 x double>* %a, <32 x float>* %b) #0 {
; VBITS_GE_2048-LABEL: fcvt_v32f64_v32f32:		; VBITS_GE_2048-LABEL: fcvt_v32f64_v32f32:
; VBITS_GE_2048: // %bb.0:		; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32		; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]		; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d
; VBITS_GE_2048-NEXT: fcvt z0.s, p0/m, z0.d		; VBITS_GE_2048-NEXT: fcvt z0.s, p0/m, z0.d
; VBITS_GE_2048-NEXT: ptrue p0.s, vl32		; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret		; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a		%op1 = load <32 x double>, <32 x double>* %a
%res = fptrunc <32 x double> %op1 to <32 x float>		%res = fptrunc <32 x double> %op1 to <32 x float>
store <32 x float> %res, <32 x float>* %b		store <32 x float> %res, <32 x float>* %b
ret void		ret void
}		}

attributes #0 = { "target-features"="+sve" }		attributes #0 = { "target-features"="+sve" }