diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15007,7 +15007,7 @@
 
   // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
   if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
-      TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
+      TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1545,7 +1545,7 @@
     setCondCodeAction(ISD::SETUNE, VT, Expand);
   }
 
-  // Mark integer truncating stores as having custom lowering
+  // Mark integer truncating stores/extending loads as having custom lowering
   if (VT.isInteger()) {
     MVT InnerVT = VT.changeVectorElementType(MVT::i8);
     while (InnerVT != VT) {
@@ -1557,6 +1557,18 @@
     }
   }
 
+  // Mark floating-point truncating stores/extending loads as having custom
+  // lowering
+  if (VT.isFloatingPoint()) {
+    MVT InnerVT = VT.changeVectorElementType(MVT::f16);
+    while (InnerVT != VT) {
+      setTruncStoreAction(VT, InnerVT, Custom);
+      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
+      InnerVT = InnerVT.changeVectorElementType(
+          MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
+    }
+  }
+
   // Lower fixed length vector operations to scalable equivalents.
   setOperationAction(ISD::ABS, VT, Custom);
   setOperationAction(ISD::ADD, VT, Custom);
@@ -18658,12 +18670,29 @@
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+  EVT LoadVT = ContainerVT;
+  EVT MemVT = Load->getMemoryVT();
+
+  auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
+
+  if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
+    LoadVT = ContainerVT.changeTypeToInteger();
+    MemVT = MemVT.changeTypeToInteger();
+  }
 
   auto NewLoad = DAG.getMaskedLoad(
-      ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
-      getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
-      Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
-      Load->getExtensionType());
+      LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
+      DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
+      Load->getAddressingMode(), Load->getExtensionType());
+
+  if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
+    EVT ExtendVT = ContainerVT.changeVectorElementType(
+        Load->getMemoryVT().getVectorElementType());
+
+    NewLoad = getSVESafeBitCast(ExtendVT, NewLoad, DAG);
+    NewLoad = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
+                          Pg, NewLoad, DAG.getUNDEF(ContainerVT));
+  }
 
   auto Result = convertFromScalableVector(DAG, VT, NewLoad);
   SDValue MergedValues[2] = {Result, Load->getChain()};
@@ -18738,13 +18767,26 @@
   SDLoc DL(Op);
   EVT VT = Store->getValue().getValueType();
   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+  EVT MemVT = Store->getMemoryVT();
+  auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
 
   auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
-  return DAG.getMaskedStore(
-      Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
-      getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
-      Store->getMemOperand(), Store->getAddressingMode(),
-      Store->isTruncatingStore());
+
+  if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
+    EVT TruncVT = ContainerVT.changeVectorElementType(
+        Store->getMemoryVT().getVectorElementType());
+    MemVT = MemVT.changeTypeToInteger();
+    NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
+                           NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
+                           DAG.getUNDEF(TruncVT));
+    NewValue =
+        getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
+  }
+
+  return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
+                            Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
+                            Store->getMemOperand(), Store->getAddressingMode(),
+                            Store->isTruncatingStore());
 }
 
 SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
@@ -48,9 +48,8 @@
 define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) #0 {
 ; CHECK-LABEL: fcvt_v8f16_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -76,16 +75,15 @@
 ; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x1]
 ; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
 ; VBITS_EQ_256-NEXT:    ret
-
+;
 ; VBITS_GE_512-LABEL: fcvt_v16f16_v16f32:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    ld1sh { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    fcvt z0.s, p0/m, z0.h
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
+
   %op1 = load <16 x half>, <16 x half>* %a
   %res = fpext <16 x half> %op1 to <16 x float>
   store <16 x float> %res, <16 x float>* %b
@@ -95,10 +93,8 @@
 define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) #0 {
 ; VBITS_GE_1024-LABEL: fcvt_v32f16_v32f32:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT:    ld1sh { z0.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    fcvt z0.s, p0/m, z0.h
 ; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x1]
 ; VBITS_GE_1024-NEXT:    ret
@@ -111,10 +107,8 @@
 define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) #0 {
 ; VBITS_GE_2048-LABEL: fcvt_v64f16_v64f32:
 ; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_2048-NEXT:    ld1sh { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    fcvt z0.s, p0/m, z0.h
 ; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x1]
 ; VBITS_GE_2048-NEXT:    ret
@@ -156,10 +150,8 @@
 define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) #0 {
 ; CHECK-LABEL: fcvt_v4f16_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -170,7 +162,6 @@
 }
 
 define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 {
-; Ensure sensible type legalisation.
 ; VBITS_EQ_256-LABEL: fcvt_v8f16_v8f64:
 ; VBITS_EQ_256:       // %bb.0:
 ; VBITS_EQ_256-NEXT:    ldr q0, [x0]
@@ -186,16 +177,15 @@
 ; VBITS_EQ_256-NEXT:    fcvt z1.d, p0/m, z1.h
 ; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
 ; VBITS_EQ_256-NEXT:    ret
-
+;
 ; VBITS_GE_512-LABEL: fcvt_v8f16_v8f64:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ldr q0, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT:    ld1sh { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    fcvt z0.d, p0/m, z0.h
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
+
   %op1 = load <8 x half>, <8 x half>* %a
   %res = fpext <8 x half> %op1 to <8 x double>
   store <8 x double> %res, <8 x double>* %b
@@ -205,11 +195,8 @@
 define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) #0 {
 ; VBITS_GE_1024-LABEL: fcvt_v16f16_v16f64:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT:    ld1sh { z0.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    fcvt z0.d, p0/m, z0.h
 ; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_1024-NEXT:    ret
@@ -222,11 +209,8 @@
 define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) #0 {
 ; VBITS_GE_2048-LABEL: fcvt_v32f16_v32f64:
 ; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_2048-NEXT:    ld1sh { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    fcvt z0.d, p0/m, z0.h
 ; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_2048-NEXT:    ret
@@ -264,9 +248,8 @@
 define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) #0 {
 ; CHECK-LABEL: fcvt_v4f32_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.s
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -292,16 +275,15 @@
 ; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x1]
 ; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
 ; VBITS_EQ_256-NEXT:    ret
-
+;
 ; VBITS_GE_512-LABEL: fcvt_v8f32_v8f64:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    fcvt z0.d, p0/m, z0.s
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
+
   %op1 = load <8 x float>, <8 x float>* %a
   %res = fpext <8 x float> %op1 to <8 x double>
   store <8 x double> %res, <8 x double>* %b
@@ -311,10 +293,8 @@
 define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) #0 {
 ; VBITS_GE_1024-LABEL: fcvt_v16f32_v16f64:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT:    ld1sw { z0.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    fcvt z0.d, p0/m, z0.s
 ; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_1024-NEXT:    ret
@@ -327,10 +307,8 @@
 define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) #0 {
 ; VBITS_GE_2048-LABEL: fcvt_v32f32_v32f64:
 ; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_2048-NEXT:    ld1sw { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    fcvt z0.d, p0/m, z0.s
 ; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_2048-NEXT:    ret
@@ -403,11 +381,8 @@
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
 ; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ptrue p0.s
 ; VBITS_GE_512-NEXT:    fcvt z0.h, p0/m, z0.s
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT:    st1h { z0.s }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %res = fptrunc <16 x float> %op1 to <16 x half>
@@ -420,11 +395,8 @@
 ; VBITS_GE_1024:       // %bb.0:
 ; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
 ; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.s
 ; VBITS_GE_1024-NEXT:    fcvt z0.h, p0/m, z0.s
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_1024-NEXT:    st1h { z0.s }, p0, [x1]
 ; VBITS_GE_1024-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %res = fptrunc <32 x float> %op1 to <32 x half>
@@ -437,11 +409,8 @@
 ; VBITS_GE_2048:       // %bb.0:
 ; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
 ; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.s
 ; VBITS_GE_2048-NEXT:    fcvt z0.h, p0/m, z0.s
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_2048-NEXT:    st1h { z0.s }, p0, [x1]
 ; VBITS_GE_2048-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %res = fptrunc <64 x float> %op1 to <64 x half>
@@ -533,12 +502,8 @@
 ; VBITS_GE_1024:       // %bb.0:
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
 ; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.d
 ; VBITS_GE_1024-NEXT:    fcvt z0.h, p0/m, z0.d
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_1024-NEXT:    st1h { z0.d }, p0, [x1]
 ; VBITS_GE_1024-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %res = fptrunc <16 x double> %op1 to <16 x half>
@@ -551,12 +516,8 @@
 ; VBITS_GE_2048:       // %bb.0:
 ; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
 ; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.d
 ; VBITS_GE_2048-NEXT:    fcvt z0.h, p0/m, z0.d
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_2048-NEXT:    st1h { z0.d }, p0, [x1]
 ; VBITS_GE_2048-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %res = fptrunc <32 x double> %op1 to <32 x half>
@@ -627,11 +588,8 @@
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
 ; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ptrue p0.d
 ; VBITS_GE_512-NEXT:    fcvt z0.s, p0/m, z0.d
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT:    st1w { z0.d }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %res = fptrunc <8 x double> %op1 to <8 x float>
@@ -644,11 +602,8 @@
 ; VBITS_GE_1024:       // %bb.0:
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
 ; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.d
 ; VBITS_GE_1024-NEXT:    fcvt z0.s, p0/m, z0.d
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_1024-NEXT:    st1w { z0.d }, p0, [x1]
 ; VBITS_GE_1024-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %res = fptrunc <16 x double> %op1 to <16 x float>
@@ -661,11 +616,8 @@
 ; VBITS_GE_2048:       // %bb.0:
 ; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
 ; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.d
 ; VBITS_GE_2048-NEXT:    fcvt z0.s, p0/m, z0.d
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_2048-NEXT:    st1w { z0.d }, p0, [x1]
 ; VBITS_GE_2048-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %res = fptrunc <32 x double> %op1 to <32 x float>
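
Illustrative only, not part of the patch: a reduced standalone .ll sketch in the style of the test above, exercising both new code paths (an extending load feeding an fpext, and a truncating store fed by an fptrunc). The function names and the single RUN line are hypothetical; the CHECK lines are transcribed from the updated VBITS_GE_512 expectations in the diff and would normally be regenerated with llvm/utils/update_llc_test_checks.py.

; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"

; With the patch, the fpext-of-load becomes a single extending load instead of
; ld1h + uunpklo.
define void @fpext_load_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
; CHECK-LABEL: fpext_load_v16f16_v16f32:
; CHECK: ptrue p0.s, vl16
; CHECK: ld1sh { z0.s }, p0/z, [x0]
; CHECK: fcvt z0.s, p0/m, z0.h
; CHECK: st1w { z0.s }, p0, [x1]
  %op = load <16 x half>, <16 x half>* %a
  %ext = fpext <16 x half> %op to <16 x float>
  store <16 x float> %ext, <16 x float>* %b
  ret void
}

; With the patch, the store-of-fptrunc becomes a single truncating store
; instead of uzp1 + st1h.
define void @fptrunc_store_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 {
; CHECK-LABEL: fptrunc_store_v16f32_v16f16:
; CHECK: ptrue p0.s, vl16
; CHECK: ld1w { z0.s }, p0/z, [x0]
; CHECK: fcvt z0.h, p0/m, z0.s
; CHECK: st1h { z0.s }, p0, [x1]
  %op = load <16 x float>, <16 x float>* %a
  %trunc = fptrunc <16 x float> %op to <16 x half>
  store <16 x half> %trunc, <16 x half>* %b
  ret void
}

attributes #0 = { "target-features"="+sve" }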