Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -951,6 +951,7 @@
     setTargetDAGCombine(ISD::UMIN);
     setTargetDAGCombine(ISD::SMAX);
     setTargetDAGCombine(ISD::UMAX);
+    setTargetDAGCombine(ISD::FP_EXTEND);
   }
 
   if (!Subtarget->hasFP64()) {
@@ -14919,9 +14920,10 @@
   return SDValue();
 }
-// Look for a sign/zero extend of a larger than legal load. This can be split
-// into two extending loads, which are simpler to deal with than an arbitrary
-// sign extend.
+// Look for a sign/zero/fpext extend of a larger than legal load. This can be
+// split into multiple extending loads, which are simpler to deal with than an
+// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
+// to convert the values to f32.
 static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
   SDValue N0 = N->getOperand(0);
   if (N0.getOpcode() != ISD::LOAD)
     return SDValue();
@@ -14943,12 +14945,15 @@
     NumElements = 4;
   if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
     NumElements = 8;
+  if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
+    NumElements = 4;
   if (NumElements == 0 ||
-      FromVT.getVectorNumElements() == NumElements ||
+      (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
       FromVT.getVectorNumElements() % NumElements != 0 ||
       !isPowerOf2_32(NumElements))
     return SDValue();
 
+  LLVMContext &C = *DAG.getContext();
   SDLoc DL(LD);
   // Details about the old load
   SDValue Ch = LD->getChain();
@@ -14960,28 +14965,58 @@
   ISD::LoadExtType NewExtType =
       N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
   SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
-  EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext());
-  EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
-  unsigned NewOffset = NewFromVT.getSizeInBits() / 8;
-  SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
-
-  // Split the load in half, each side of which is extended separately. This
-  // is good enough, as legalisation will take it from there. They are either
-  // already legal or they will be split further into something that is
-  // legal.
-  SDValue NewLoad1 = DAG.getLoad(
-      ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset,
-      LD->getPointerInfo(), NewFromVT, Alignment.value(), MMOFlags, AAInfo);
-  SDValue NewLoad2 =
-      DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
-                  LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
-                  Alignment.value(), MMOFlags, AAInfo);
-
-  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
-                                 SDValue(NewLoad1.getNode(), 1),
-                                 SDValue(NewLoad2.getNode(), 1));
+  EVT NewFromVT = EVT::getVectorVT(
+      C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
+  EVT NewToVT = EVT::getVectorVT(
+      C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
+
+  SmallVector<SDValue, 4> Loads;
+  SmallVector<SDValue, 4> Chains;
+  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
+    unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
+    SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
+
+    SDValue NewLoad =
+        DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
+                    LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
+                    Alignment.value(), MMOFlags, AAInfo);
+    Loads.push_back(NewLoad);
+    Chains.push_back(SDValue(NewLoad.getNode(), 1));
+  }
+
+  // Float truncs need to be extended with VCVTBs into their floating point types.
+  if (FromEltVT == MVT::f16) {
+    SmallVector<SDValue, 4> Extends;
+
+    for (unsigned i = 0; i < Loads.size(); i++) {
+      SDValue LoadBC =
+          DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
+      SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
+                                  DAG.getConstant(0, DL, MVT::i32));
+      Extends.push_back(FPExt);
+    }
+
+    Loads = Extends;
+  }
+
+  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
   DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
-  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2);
+
+  // Combine all loads together using pairwise CONCATs
+  while (Loads.size() > 1) {
+    assert(isPowerOf2_32(Loads.size()) && "Expected a power of 2 loads.");
+    EVT LoadVT = Loads[0].getValueType();
+    EVT ConcatVT =
+        EVT::getVectorVT(C, ToEltVT, LoadVT.getVectorElementCount() * 2);
+
+    SmallVector<SDValue, 4> Concats;
+    for (unsigned i = 0; i < Loads.size() / 2; i++)
+      Concats.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT,
+                                    Loads[i * 2 + 0], Loads[i * 2 + 1]));
+    Loads = Concats;
+  }
+
+  return Loads[0];
 }
 
 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
@@ -15029,6 +15064,15 @@
   return SDValue();
 }
 
+static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
+                                      const ARMSubtarget *ST) {
+  if (ST->hasMVEFloatOps())
+    if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
+      return NewLoad;
+
+  return SDValue();
+}
+
 /// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
 /// saturates.
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, @@ -15687,6 +15731,8 @@ case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); + case ISD::FP_EXTEND: + return PerformFPExtendCombine(N, DCI.DAG, Subtarget); case ISD::SMIN: case ISD::UMIN: case ISD::SMAX: Index: llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll +++ llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll @@ -166,18 +166,9 @@ ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r2, [r0] -; CHECK-NEXT: ldr r3, [r0, #4] -; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: vmov.32 q1[0], r2 -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: vmovx.f16 s10, s5 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vcvtb.f32.f16 s15, s10 -; CHECK-NEXT: vcvtb.f32.f16 s14, s5 -; CHECK-NEXT: vcvtb.f32.f16 s13, s8 -; CHECK-NEXT: vcvtb.f32.f16 s12, s4 -; CHECK-NEXT: vmul.f32 q1, q3, q0 +; CHECK-NEXT: vldrh.u32 q1, [r0], #8 +; CHECK-NEXT: vcvtb.f32.f16 q1, q1 +; CHECK-NEXT: vmul.f32 q1, q1, q0 ; CHECK-NEXT: vstrb.8 q1, [r1], #16 ; CHECK-NEXT: le lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup @@ -215,34 +206,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adr r2, .LCPI4_0 ; CHECK-NEXT: mov.w lr, #128 ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q1, [r0], #16 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vmovx.f16 s13, s7 -; CHECK-NEXT: vcvtb.f32.f16 s11, s8 -; CHECK-NEXT: vmovx.f16 s14, s6 -; CHECK-NEXT: vcvtb.f32.f16 s10, s5 -; CHECK-NEXT: vcvtb.f32.f16 s19, s13 -; CHECK-NEXT: vcvtb.f32.f16 s18, s7 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vcvtb.f32.f16 s17, s14 -; CHECK-NEXT: vcvtb.f32.f16 s16, s6 -; CHECK-NEXT: vcvtb.f32.f16 s9, s12 -; CHECK-NEXT: vcvtb.f32.f16 s8, s4 -; CHECK-NEXT: vmul.f32 q1, q4, q0 +; CHECK-NEXT: vldrh.u32 q1, [r0, #8] +; CHECK-NEXT: vcvtb.f32.f16 q1, q1 +; CHECK-NEXT: vmul.f32 q1, q1, q0 ; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vmul.f32 q1, q2, q0 +; CHECK-NEXT: vldrh.u32 q1, [r0], #16 +; CHECK-NEXT: vcvtb.f32.f16 q1, q1 +; CHECK-NEXT: vmul.f32 q1, q1, q0 ; CHECK-NEXT: vstrw.32 q1, [r1], #32 ; CHECK-NEXT: le lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: @@ -277,51 +256,30 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: adr r2, .LCPI5_0 ; CHECK-NEXT: mov.w lr, #64 ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q1, [r0], #32 -; CHECK-NEXT: vmovx.f16 s12, s5 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vcvtb.f32.f16 s15, s12 -; CHECK-NEXT: vmovx.f16 s18, s4 -; CHECK-NEXT: vcvtb.f32.f16 s14, s5 -; CHECK-NEXT: vmovx.f16 s16, s7 -; CHECK-NEXT: vcvtb.f32.f16 s13, s18 -; CHECK-NEXT: vmovx.f16 s20, s9 -; CHECK-NEXT: vcvtb.f32.f16 s12, s4 -; CHECK-NEXT: vcvtb.f32.f16 s19, s16 -; CHECK-NEXT: vmovx.f16 s22, s6 -; CHECK-NEXT: 
vcvtb.f32.f16 s18, s7 -; CHECK-NEXT: vcvtb.f32.f16 s17, s22 -; CHECK-NEXT: vcvtb.f32.f16 s23, s20 -; CHECK-NEXT: vmovx.f16 s28, s11 -; CHECK-NEXT: vcvtb.f32.f16 s22, s9 -; CHECK-NEXT: vcvtb.f32.f16 s31, s28 -; CHECK-NEXT: vmovx.f16 s26, s10 -; CHECK-NEXT: vcvtb.f32.f16 s30, s11 -; CHECK-NEXT: vmovx.f16 s24, s8 -; CHECK-NEXT: vcvtb.f32.f16 s29, s26 -; CHECK-NEXT: vcvtb.f32.f16 s28, s10 -; CHECK-NEXT: vcvtb.f32.f16 s21, s24 -; CHECK-NEXT: vcvtb.f32.f16 s20, s8 -; CHECK-NEXT: vcvtb.f32.f16 s16, s6 -; CHECK-NEXT: vmul.f32 q1, q7, q0 +; CHECK-NEXT: vldrh.u32 q1, [r0, #24] +; CHECK-NEXT: vcvtb.f32.f16 q1, q1 +; CHECK-NEXT: vmul.f32 q1, q1, q0 ; CHECK-NEXT: vstrw.32 q1, [r1, #48] -; CHECK-NEXT: vmul.f32 q1, q5, q0 +; CHECK-NEXT: vldrh.u32 q1, [r0, #16] +; CHECK-NEXT: vcvtb.f32.f16 q1, q1 +; CHECK-NEXT: vmul.f32 q1, q1, q0 ; CHECK-NEXT: vstrw.32 q1, [r1, #32] -; CHECK-NEXT: vmul.f32 q1, q4, q0 +; CHECK-NEXT: vldrh.u32 q1, [r0, #8] +; CHECK-NEXT: vcvtb.f32.f16 q1, q1 +; CHECK-NEXT: vmul.f32 q1, q1, q0 ; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vmul.f32 q1, q3, q0 +; CHECK-NEXT: vldrh.u32 q1, [r0], #32 +; CHECK-NEXT: vcvtb.f32.f16 q1, q1 +; CHECK-NEXT: vmul.f32 q1, q1, q0 ; CHECK-NEXT: vstrw.32 q1, [r1], #64 ; CHECK-NEXT: le lr, .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: @@ -362,18 +320,9 @@ ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r2, [r0] -; CHECK-NEXT: ldr r3, [r0, #4] -; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: vmov.32 q1[0], r2 -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: vmovx.f16 s10, s5 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vcvtb.f32.f16 s15, s10 -; CHECK-NEXT: vcvtb.f32.f16 s14, s5 -; CHECK-NEXT: vcvtb.f32.f16 s13, s8 -; CHECK-NEXT: vcvtb.f32.f16 s12, s4 -; CHECK-NEXT: vmul.f32 q1, q3, q0 +; CHECK-NEXT: vldrh.u32 q1, [r0], #8 +; CHECK-NEXT: vcvtb.f32.f16 q1, q1 +; CHECK-NEXT: vmul.f32 q1, q1, q0 ; CHECK-NEXT: vcvtb.f16.f32 q1, q1 ; CHECK-NEXT: vstrh.32 q1, [r1], #8 ; CHECK-NEXT: le lr, .LBB6_1 @@ -419,23 +368,14 @@ ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q1, [r0], #16 -; CHECK-NEXT: vmovx.f16 s10, s7 -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vcvtb.f32.f16 s15, s10 -; CHECK-NEXT: vcvtb.f32.f16 s14, s7 -; CHECK-NEXT: vcvtb.f32.f16 s13, s8 -; CHECK-NEXT: vcvtb.f32.f16 s12, s6 -; CHECK-NEXT: vmul.f32 q2, q3, q0 -; CHECK-NEXT: vcvtb.f16.f32 q2, q2 -; CHECK-NEXT: vstrh.32 q2, [r1, #8] -; CHECK-NEXT: vmovx.f16 s10, s5 -; CHECK-NEXT: vcvtb.f32.f16 s15, s10 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vcvtb.f32.f16 s14, s5 -; CHECK-NEXT: vcvtb.f32.f16 s13, s8 -; CHECK-NEXT: vcvtb.f32.f16 s12, s4 -; CHECK-NEXT: vmul.f32 q1, q3, q0 +; CHECK-NEXT: vldrh.u32 q1, [r0, #8] +; CHECK-NEXT: vcvtb.f32.f16 q1, q1 +; CHECK-NEXT: vmul.f32 q1, q1, q0 +; CHECK-NEXT: vcvtb.f16.f32 q1, q1 +; CHECK-NEXT: vstrh.32 q1, [r1, #8] +; CHECK-NEXT: vldrh.u32 q1, [r0], #16 +; CHECK-NEXT: vcvtb.f32.f16 q1, q1 +; CHECK-NEXT: vmul.f32 q1, q1, q0 ; CHECK-NEXT: vcvtb.f16.f32 q1, q1 ; CHECK-NEXT: vstrh.32 q1, [r1], #16 ; CHECK-NEXT: le lr, .LBB7_1 @@ -481,42 +421,24 @@ ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q1, [r0, #16] -; CHECK-NEXT: vmovx.f16 s10, s7 -; CHECK-NEXT: vmovx.f16 s8, s6 
-; CHECK-NEXT: vcvtb.f32.f16 s15, s10 -; CHECK-NEXT: vcvtb.f32.f16 s14, s7 -; CHECK-NEXT: vcvtb.f32.f16 s13, s8 -; CHECK-NEXT: vcvtb.f32.f16 s12, s6 -; CHECK-NEXT: vmul.f32 q2, q3, q0 -; CHECK-NEXT: vcvtb.f16.f32 q2, q2 -; CHECK-NEXT: vstrh.32 q2, [r1, #24] -; CHECK-NEXT: vmovx.f16 s10, s5 -; CHECK-NEXT: vcvtb.f32.f16 s15, s10 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vcvtb.f32.f16 s14, s5 -; CHECK-NEXT: vcvtb.f32.f16 s13, s8 -; CHECK-NEXT: vcvtb.f32.f16 s12, s4 -; CHECK-NEXT: vmul.f32 q1, q3, q0 +; CHECK-NEXT: vldrh.u32 q1, [r0, #24] +; CHECK-NEXT: vcvtb.f32.f16 q1, q1 +; CHECK-NEXT: vmul.f32 q1, q1, q0 +; CHECK-NEXT: vcvtb.f16.f32 q1, q1 +; CHECK-NEXT: vstrh.32 q1, [r1, #24] +; CHECK-NEXT: vldrh.u32 q1, [r0, #16] +; CHECK-NEXT: vcvtb.f32.f16 q1, q1 +; CHECK-NEXT: vmul.f32 q1, q1, q0 ; CHECK-NEXT: vcvtb.f16.f32 q1, q1 ; CHECK-NEXT: vstrh.32 q1, [r1, #16] -; CHECK-NEXT: vldrh.u16 q1, [r0], #32 -; CHECK-NEXT: vmovx.f16 s10, s7 -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vcvtb.f32.f16 s15, s10 -; CHECK-NEXT: vcvtb.f32.f16 s14, s7 -; CHECK-NEXT: vcvtb.f32.f16 s13, s8 -; CHECK-NEXT: vcvtb.f32.f16 s12, s6 -; CHECK-NEXT: vmul.f32 q2, q3, q0 -; CHECK-NEXT: vcvtb.f16.f32 q2, q2 -; CHECK-NEXT: vstrh.32 q2, [r1, #8] -; CHECK-NEXT: vmovx.f16 s10, s5 -; CHECK-NEXT: vcvtb.f32.f16 s15, s10 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vcvtb.f32.f16 s14, s5 -; CHECK-NEXT: vcvtb.f32.f16 s13, s8 -; CHECK-NEXT: vcvtb.f32.f16 s12, s4 -; CHECK-NEXT: vmul.f32 q1, q3, q0 +; CHECK-NEXT: vldrh.u32 q1, [r0, #8] +; CHECK-NEXT: vcvtb.f32.f16 q1, q1 +; CHECK-NEXT: vmul.f32 q1, q1, q0 +; CHECK-NEXT: vcvtb.f16.f32 q1, q1 +; CHECK-NEXT: vstrh.32 q1, [r1, #8] +; CHECK-NEXT: vldrh.u32 q1, [r0], #32 +; CHECK-NEXT: vcvtb.f32.f16 q1, q1 +; CHECK-NEXT: vmul.f32 q1, q1, q0 ; CHECK-NEXT: vcvtb.f16.f32 q1, q1 ; CHECK-NEXT: vstrh.32 q1, [r1], #32 ; CHECK-NEXT: le lr, .LBB8_1 Index: llvm/test/CodeGen/Thumb2/mve-vcvt16.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vcvt16.ll +++ llvm/test/CodeGen/Thumb2/mve-vcvt16.ll @@ -217,15 +217,8 @@ define arm_aapcs_vfpcc <4 x float> @load_ext_4(<4 x half>* %src) { ; CHECK-LABEL: load_ext_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r0, [r0] -; CHECK-NEXT: vmov.32 q1[0], r1 -; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vcvtb.f32.f16 s3, s0 -; CHECK-NEXT: vcvtb.f32.f16 s2, s5 -; CHECK-NEXT: vcvtb.f32.f16 s1, s8 -; CHECK-NEXT: vcvtb.f32.f16 s0, s4 +; CHECK-NEXT: vldrh.u32 q0, [r0] +; CHECK-NEXT: vcvtb.f32.f16 q0, q0 ; CHECK-NEXT: bx lr entry: %wide.load = load <4 x half>, <4 x half>* %src, align 4 @@ -236,19 +229,10 @@ define arm_aapcs_vfpcc <8 x float> @load_ext_8(<8 x half>* %src) { ; CHECK-LABEL: load_ext_8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmovx.f16 s6, s8 -; CHECK-NEXT: vcvtb.f32.f16 s3, s0 -; CHECK-NEXT: vmovx.f16 s4, s11 -; CHECK-NEXT: vcvtb.f32.f16 s2, s9 -; CHECK-NEXT: vmovx.f16 s12, s10 -; CHECK-NEXT: vcvtb.f32.f16 s1, s6 -; CHECK-NEXT: vcvtb.f32.f16 s0, s8 -; CHECK-NEXT: vcvtb.f32.f16 s7, s4 -; CHECK-NEXT: vcvtb.f32.f16 s6, s11 -; CHECK-NEXT: vcvtb.f32.f16 s5, s12 -; CHECK-NEXT: vcvtb.f32.f16 s4, s10 +; CHECK-NEXT: vldrh.u32 q0, [r0] +; CHECK-NEXT: vldrh.u32 q1, [r0, #8] +; CHECK-NEXT: vcvtb.f32.f16 q0, q0 +; CHECK-NEXT: vcvtb.f32.f16 q1, q1 ; CHECK-NEXT: bx lr entry: %wide.load = load <8 x half>, <8 x half>* %src, align 4 @@ -259,35 +243,14 @@ define arm_aapcs_vfpcc <16 x 
float> @load_ext_16(<16 x half>* %src) { ; CHECK-LABEL: load_ext_16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10} -; CHECK-NEXT: vpush {d8, d9, d10} -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmovx.f16 s6, s8 -; CHECK-NEXT: vcvtb.f32.f16 s3, s0 -; CHECK-NEXT: vmovx.f16 s4, s11 -; CHECK-NEXT: vcvtb.f32.f16 s2, s9 -; CHECK-NEXT: vmovx.f16 s15, s10 -; CHECK-NEXT: vcvtb.f32.f16 s1, s6 -; CHECK-NEXT: vmovx.f16 s13, s17 -; CHECK-NEXT: vcvtb.f32.f16 s0, s8 -; CHECK-NEXT: vcvtb.f32.f16 s7, s4 -; CHECK-NEXT: vcvtb.f32.f16 s6, s11 -; CHECK-NEXT: vmovx.f16 s14, s16 -; CHECK-NEXT: vcvtb.f32.f16 s5, s15 -; CHECK-NEXT: vmovx.f16 s12, s19 -; CHECK-NEXT: vcvtb.f32.f16 s4, s10 -; CHECK-NEXT: vcvtb.f32.f16 s11, s13 -; CHECK-NEXT: vcvtb.f32.f16 s10, s17 -; CHECK-NEXT: vmovx.f16 s20, s18 -; CHECK-NEXT: vcvtb.f32.f16 s9, s14 -; CHECK-NEXT: vcvtb.f32.f16 s8, s16 -; CHECK-NEXT: vcvtb.f32.f16 s15, s12 -; CHECK-NEXT: vcvtb.f32.f16 s14, s19 -; CHECK-NEXT: vcvtb.f32.f16 s13, s20 -; CHECK-NEXT: vcvtb.f32.f16 s12, s18 -; CHECK-NEXT: vpop {d8, d9, d10} +; CHECK-NEXT: vldrh.u32 q0, [r0] +; CHECK-NEXT: vldrh.u32 q1, [r0, #8] +; CHECK-NEXT: vldrh.u32 q2, [r0, #16] +; CHECK-NEXT: vldrh.u32 q3, [r0, #24] +; CHECK-NEXT: vcvtb.f32.f16 q0, q0 +; CHECK-NEXT: vcvtb.f32.f16 q1, q1 +; CHECK-NEXT: vcvtb.f32.f16 q2, q2 +; CHECK-NEXT: vcvtb.f32.f16 q3, q3 ; CHECK-NEXT: bx lr entry: %wide.load = load <16 x half>, <16 x half>* %src, align 4 Index: llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll +++ llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll @@ -134,14 +134,14 @@ define void @foo_int32_int8_double(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) { ; CHECK-LABEL: foo_int32_int8_double: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.s32 q0, [r1, #4] -; CHECK-NEXT: vldrb.s32 q1, [r1] -; CHECK-NEXT: vldrb.s32 q2, [r1, #12] -; CHECK-NEXT: vldrb.s32 q3, [r1, #8] -; CHECK-NEXT: vstrw.32 q1, [r0] -; CHECK-NEXT: vstrw.32 q0, [r0, #16] -; CHECK-NEXT: vstrw.32 q3, [r0, #32] -; CHECK-NEXT: vstrw.32 q2, [r0, #48] +; CHECK-NEXT: vldrb.s32 q0, [r1] +; CHECK-NEXT: vldrb.s32 q1, [r1, #4] +; CHECK-NEXT: vldrb.s32 q2, [r1, #8] +; CHECK-NEXT: vldrb.s32 q3, [r1, #12] +; CHECK-NEXT: vstrw.32 q1, [r0, #16] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vstrw.32 q3, [r0, #48] +; CHECK-NEXT: vstrw.32 q2, [r0, #32] ; CHECK-NEXT: bx lr entry: %wide.load = load <16 x i8>, <16 x i8>* %src, align 1 @@ -224,14 +224,14 @@ define void @foo_uint32_uint8_double(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) { ; CHECK-LABEL: foo_uint32_uint8_double: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q0, [r1, #4] -; CHECK-NEXT: vldrb.u32 q1, [r1] -; CHECK-NEXT: vldrb.u32 q2, [r1, #12] -; CHECK-NEXT: vldrb.u32 q3, [r1, #8] -; CHECK-NEXT: vstrw.32 q1, [r0] -; CHECK-NEXT: vstrw.32 q0, [r0, #16] -; CHECK-NEXT: vstrw.32 q3, [r0, #32] -; CHECK-NEXT: vstrw.32 q2, [r0, #48] +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrb.u32 q1, [r1, #4] +; CHECK-NEXT: vldrb.u32 q2, [r1, #8] +; CHECK-NEXT: vldrb.u32 q3, [r1, #12] +; CHECK-NEXT: vstrw.32 q1, [r0, #16] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vstrw.32 q3, [r0, #48] +; CHECK-NEXT: vstrw.32 q2, [r0, #32] ; CHECK-NEXT: bx lr entry: %wide.load = load <16 x i8>, <16 x i8>* %src, align 1 @@ -347,12 +347,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q0, [r1, #32]! 
; CHECK-NEXT: vldrh.s32 q1, [r1, #8] -; CHECK-NEXT: vldrh.s32 q2, [r1, #24] -; CHECK-NEXT: vldrh.s32 q3, [r1, #16] +; CHECK-NEXT: vldrh.s32 q2, [r1, #16] +; CHECK-NEXT: vldrh.s32 q3, [r1, #24] ; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: vstrw.32 q2, [r0, #48] +; CHECK-NEXT: vstrw.32 q2, [r0, #32] ; CHECK-NEXT: vstrw.32 q1, [r0, #16] -; CHECK-NEXT: vstrw.32 q3, [r0, #32] +; CHECK-NEXT: vstrw.32 q3, [r0, #48] ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bx lr entry:
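
For reference, the shape of input the new FP_EXTEND combine targets can be written as a small standalone IR function. This is only an illustrative sketch mirroring the load_ext_8 test in mve-vcvt16.ll above; the function name is made up for the example:

define arm_aapcs_vfpcc <8 x float> @fpext_load_example(<8 x half>* %src) {
entry:
  ; A wider-than-legal <8 x half> load followed by an fpext to <8 x float>.
  ; The combine splits this into two v4i16->v4i32 zero-extending loads, each
  ; converted to v4f32 with an ARMISD::VCVTL (vcvtb.f32.f16) node.
  %wide.load = load <8 x half>, <8 x half>* %src, align 4
  %ext = fpext <8 x half> %wide.load to <8 x float>
  ret <8 x float> %ext
}

With -mattr=+mve.fp this is expected to lower to two vldrh.u32 loads plus two vcvtb.f32.f16 instructions, as the updated CHECK lines for load_ext_8 show, rather than the previous scalar vmovx/vcvtb sequence.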