diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -14268,7 +14268,7 @@
   if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
     return SDValue();
   SDValue Trunc = St->getValue();
-  if (Trunc->getOpcode() != ISD::TRUNCATE)
+  if (Trunc->getOpcode() != ISD::TRUNCATE && Trunc->getOpcode() != ISD::FP_ROUND)
     return SDValue();
   EVT FromVT = Trunc->getOperand(0).getValueType();
   EVT ToVT = Trunc.getValueType();
@@ -14283,7 +14283,10 @@
     NumElements = 4;
   if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
     NumElements = 8;
-  if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements ||
+  if (FromEltVT == MVT::f32 && ToEltVT == MVT::f16)
+    NumElements = 4;
+  if (NumElements == 0 ||
+      (FromEltVT != MVT::f32 && FromVT.getVectorNumElements() == NumElements) ||
       FromVT.getVectorNumElements() % NumElements != 0)
     return SDValue();
 
@@ -14293,7 +14296,7 @@
   // rev: N 0 N+1 1 N+2 2 ...
   auto isVMOVNOriginalMask = [&](ArrayRef<int> M, bool rev) {
     unsigned NumElts = ToVT.getVectorNumElements();
-    if (NumElts != M.size() || (ToVT != MVT::v8i16 && ToVT != MVT::v16i8))
+    if (NumElts != M.size())
      return false;
 
     unsigned Off0 = rev ? NumElts : 0;
@@ -14314,6 +14317,7 @@
       isVMOVNOriginalMask(Shuffle->getMask(), true))
     return SDValue();
 
+  LLVMContext &C = *DAG.getContext();
   SDLoc DL(St);
   // Details about the old store
   SDValue Ch = St->getChain();
@@ -14322,8 +14326,11 @@
   MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
   AAMDNodes AAInfo = St->getAAInfo();
 
-  EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements);
-  EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements);
+  // We split the store into slices of NumElements. fp16 trunc stores are
+  // vcvt'd and then stored as truncating integer stores.
+  EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
+  EVT NewToVT = EVT::getVectorVT(
+      C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
 
   SmallVector<SDValue, 4> Stores;
   for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
@@ -14333,6 +14340,14 @@
     SDValue Extract =
         DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
                     DAG.getConstant(i * NumElements, DL, MVT::i32));
+
+    if (ToEltVT == MVT::f16) {
+      SDValue FPTrunc =
+          DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
+                      Extract, DAG.getConstant(0, DL, MVT::i32));
+      Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
+    }
+
     SDValue Store = DAG.getTruncStore(
         Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
         NewToVT, Alignment.value(), MMOFlags, AAInfo);
diff --git a/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll b/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
--- a/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
@@ -14,23 +14,8 @@
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vmul.f32 q1, q1, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s4
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s5
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov.16 q2[0], r2
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s6
-; CHECK-NEXT:    vmov.16 q2[1], r3
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s7
-; CHECK-NEXT:    vmov.16 q2[2], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov.16 q2[3], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    str r2, [r1]
-; CHECK-NEXT:    str r3, [r1, #4]
-; CHECK-NEXT:    adds r1, #8
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vstrh.32 q1, [r1], #8
 ; CHECK-NEXT:    le lr, .LBB0_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
@@ -73,35 +58,14 @@
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vstrh.32 q1, [r1, #8]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #32
-; CHECK-NEXT:    vmul.f32 q2, q1, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s8
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s9
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q1[0], r2
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s10
-; CHECK-NEXT:    vmov.16 q1[1], r3
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s11
-; CHECK-NEXT:    vmov.16 q1[2], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #-16]
-; CHECK-NEXT:    vmov.16 q1[3], r2
-; CHECK-NEXT:    vmul.f32 q2, q2, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s8
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s9
-; CHECK-NEXT:    vmov.16 q1[4], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s10
-; CHECK-NEXT:    vmov.16 q1[5], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s11
-; CHECK-NEXT:    vmov.16 q1[6], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.16 q1[7], r2
-; CHECK-NEXT:    vstrb.8 q1, [r1], #16
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vstrh.32 q1, [r1], #16
 ; CHECK-NEXT:    le lr, .LBB1_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
@@ -144,64 +108,22 @@
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB2_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vstrh.32 q1, [r1, #24]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vmul.f32 q2, q1, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s8
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s9
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q1[0], r2
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s10
-; CHECK-NEXT:    vmov.16 q1[1], r3
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s11
-; CHECK-NEXT:    vmov.16 q1[2], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT:    vmov.16 q1[3], r2
-; CHECK-NEXT:    vmul.f32 q2, q2, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s8
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s9
-; CHECK-NEXT:    vmov.16 q1[4], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s10
-; CHECK-NEXT:    vmov.16 q1[5], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s11
-; CHECK-NEXT:    vmov.16 q1[6], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.16 q1[7], r2
-; CHECK-NEXT:    vstrh.16 q1, [r1, #16]
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vstrh.32 q1, [r1, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vstrh.32 q1, [r1, #8]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #64
-; CHECK-NEXT:    vmul.f32 q2, q1, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s9
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s8
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s10
-; CHECK-NEXT:    vmov.16 q1[0], r3
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s11
-; CHECK-NEXT:    vmov.16 q1[1], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov.16 q1[2], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #-48]
-; CHECK-NEXT:    vmov.16 q1[3], r2
-; CHECK-NEXT:    vmul.f32 q2, q2, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s8
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s9
-; CHECK-NEXT:    vmov.16 q1[4], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s10
-; CHECK-NEXT:    vmov.16 q1[5], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s11
-; CHECK-NEXT:    vmov.16 q1[6], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.16 q1[7], r2
-; CHECK-NEXT:    vstrh.16 q1, [r1], #32
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vstrh.32 q1, [r1], #32
 ; CHECK-NEXT:    le lr, .LBB2_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
@@ -452,23 +374,8 @@
 ; CHECK-NEXT:    vcvtb.f32.f16 s13, s8
 ; CHECK-NEXT:    vcvtb.f32.f16 s12, s4
 ; CHECK-NEXT:    vmul.f32 q1, q3, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s4
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s5
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov.16 q2[0], r2
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s6
-; CHECK-NEXT:    vmov.16 q2[1], r3
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s7
-; CHECK-NEXT:    vmov.16 q2[2], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov.16 q2[3], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    str r2, [r1]
-; CHECK-NEXT:    str r3, [r1, #4]
-; CHECK-NEXT:    adds r1, #8
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vstrh.32 q1, [r1], #8
 ; CHECK-NEXT:    le lr, .LBB6_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
@@ -506,57 +413,33 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    adr r2, .LCPI7_0
 ; CHECK-NEXT:    mov.w lr, #128
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB7_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q2, [r0], #16
-; CHECK-NEXT:    vmovx.f16 s6, s9
-; CHECK-NEXT:    vmovx.f16 s4, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s15, s6
-; CHECK-NEXT:    vcvtb.f32.f16 s14, s9
-; CHECK-NEXT:    vcvtb.f32.f16 s13, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s12, s8
-; CHECK-NEXT:    vmul.f32 q3, q3, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s12
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s13
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q1[0], r2
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s14
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s15
-; CHECK-NEXT:    vmovx.f16 s14, s11
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vcvtb.f32.f16 s19, s14
-; CHECK-NEXT:    vmov.16 q1[1], r3
-; CHECK-NEXT:    vmov.16 q1[2], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmovx.f16 s12, s10
-; CHECK-NEXT:    vcvtb.f32.f16 s18, s11
-; CHECK-NEXT:    vcvtb.f32.f16 s17, s12
-; CHECK-NEXT:    vmov.16 q1[3], r2
-; CHECK-NEXT:    vcvtb.f32.f16 s16, s10
-; CHECK-NEXT:    vmul.f32 q2, q4, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s8
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s9
-; CHECK-NEXT:    vmov.16 q1[4], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s10
-; CHECK-NEXT:    vmov.16 q1[5], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s11
-; CHECK-NEXT:    vmov.16 q1[6], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.16 q1[7], r2
-; CHECK-NEXT:    vstrb.8 q1, [r1], #16
+; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
+; CHECK-NEXT:    vmovx.f16 s10, s7
+; CHECK-NEXT:    vmovx.f16 s8, s6
+; CHECK-NEXT:    vcvtb.f32.f16 s15, s10
+; CHECK-NEXT:    vcvtb.f32.f16 s14, s7
+; CHECK-NEXT:    vcvtb.f32.f16 s13, s8
+; CHECK-NEXT:    vcvtb.f32.f16 s12, s6
+; CHECK-NEXT:    vmul.f32 q2, q3, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
+; CHECK-NEXT:    vstrh.32 q2, [r1, #8]
+; CHECK-NEXT:    vmovx.f16 s10, s5
+; CHECK-NEXT:    vcvtb.f32.f16 s15, s10
+; CHECK-NEXT:    vmovx.f16 s8, s4
+; CHECK-NEXT:    vcvtb.f32.f16 s14, s5
+; CHECK-NEXT:    vcvtb.f32.f16 s13, s8
+; CHECK-NEXT:    vcvtb.f32.f16 s12, s4
+; CHECK-NEXT:    vmul.f32 q1, q3, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vstrh.32 q1, [r1], #16
 ; CHECK-NEXT:    le lr, .LBB7_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
@@ -592,97 +475,52 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    adr r2, .LCPI8_0
 ; CHECK-NEXT:    mov.w lr, #64
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB8_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q2, [r0, #16]
-; CHECK-NEXT:    vmovx.f16 s6, s9
-; CHECK-NEXT:    vmovx.f16 s4, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s15, s6
-; CHECK-NEXT:    vcvtb.f32.f16 s14, s9
-; CHECK-NEXT:    vcvtb.f32.f16 s13, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s12, s8
-; CHECK-NEXT:    vmul.f32 q3, q3, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s12
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s13
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q1[0], r2
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s14
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s15
-; CHECK-NEXT:    vmovx.f16 s14, s11
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vcvtb.f32.f16 s19, s14
-; CHECK-NEXT:    vmov.16 q1[1], r3
-; CHECK-NEXT:    vmov.16 q1[2], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmovx.f16 s12, s10
-; CHECK-NEXT:    vcvtb.f32.f16 s18, s11
-; CHECK-NEXT:    vcvtb.f32.f16 s17, s12
-; CHECK-NEXT:    vmov.16 q1[3], r2
-; CHECK-NEXT:    vcvtb.f32.f16 s16, s10
-; CHECK-NEXT:    vmul.f32 q2, q4, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s8
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s9
-; CHECK-NEXT:    vmov.16 q1[4], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s10
-; CHECK-NEXT:    vmov.16 q1[5], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s11
-; CHECK-NEXT:    vmov.16 q1[6], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vldrh.u16 q2, [r0], #32
-; CHECK-NEXT:    vmov.16 q1[7], r2
-; CHECK-NEXT:    vstrh.16 q1, [r1, #16]
-; CHECK-NEXT:    vmovx.f16 s6, s9
-; CHECK-NEXT:    vmovx.f16 s4, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s15, s6
-; CHECK-NEXT:    vcvtb.f32.f16 s14, s9
-; CHECK-NEXT:    vcvtb.f32.f16 s13, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s12, s8
-; CHECK-NEXT:    vmul.f32 q3, q3, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s12
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s13
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q1[0], r2
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s14
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s15
-; CHECK-NEXT:    vmovx.f16 s14, s11
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vcvtb.f32.f16 s19, s14
-; CHECK-NEXT:    vmov.16 q1[1], r3
-; CHECK-NEXT:    vmov.16 q1[2], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmovx.f16 s12, s10
-; CHECK-NEXT:    vcvtb.f32.f16 s18, s11
-; CHECK-NEXT:    vcvtb.f32.f16 s17, s12
-; CHECK-NEXT:    vmov.16 q1[3], r2
-; CHECK-NEXT:    vcvtb.f32.f16 s16, s10
-; CHECK-NEXT:    vmul.f32 q2, q4, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s8
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s9
-; CHECK-NEXT:    vmov.16 q1[4], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s10
-; CHECK-NEXT:    vmov.16 q1[5], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s11
-; CHECK-NEXT:    vmov.16 q1[6], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.16 q1[7], r2
-; CHECK-NEXT:    vstrh.16 q1, [r1], #32
+; CHECK-NEXT:    vldrh.u16 q1, [r0, #16]
+; CHECK-NEXT:    vmovx.f16 s10, s7
+; CHECK-NEXT:    vmovx.f16 s8, s6
+; CHECK-NEXT:    vcvtb.f32.f16 s15, s10
+; CHECK-NEXT:    vcvtb.f32.f16 s14, s7
+; CHECK-NEXT:    vcvtb.f32.f16 s13, s8
+; CHECK-NEXT:    vcvtb.f32.f16 s12, s6
+; CHECK-NEXT:    vmul.f32 q2, q3, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
+; CHECK-NEXT:    vstrh.32 q2, [r1, #24]
+; CHECK-NEXT:    vmovx.f16 s10, s5
+; CHECK-NEXT:    vcvtb.f32.f16 s15, s10
+; CHECK-NEXT:    vmovx.f16 s8, s4
+; CHECK-NEXT:    vcvtb.f32.f16 s14, s5
+; CHECK-NEXT:    vcvtb.f32.f16 s13, s8
+; CHECK-NEXT:    vcvtb.f32.f16 s12, s4
+; CHECK-NEXT:    vmul.f32 q1, q3, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vstrh.32 q1, [r1, #16]
+; CHECK-NEXT:    vldrh.u16 q1, [r0], #32
+; CHECK-NEXT:    vmovx.f16 s10, s7
+; CHECK-NEXT:    vmovx.f16 s8, s6
+; CHECK-NEXT:    vcvtb.f32.f16 s15, s10
+; CHECK-NEXT:    vcvtb.f32.f16 s14, s7
+; CHECK-NEXT:    vcvtb.f32.f16 s13, s8
+; CHECK-NEXT:    vcvtb.f32.f16 s12, s6
+; CHECK-NEXT:    vmul.f32 q2, q3, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
+; CHECK-NEXT:    vstrh.32 q2, [r1, #8]
+; CHECK-NEXT:    vmovx.f16 s10, s5
+; CHECK-NEXT:    vcvtb.f32.f16 s15, s10
+; CHECK-NEXT:    vmovx.f16 s8, s4
+; CHECK-NEXT:    vcvtb.f32.f16 s14, s5
+; CHECK-NEXT:    vcvtb.f32.f16 s13, s8
+; CHECK-NEXT:    vcvtb.f32.f16 s12, s4
+; CHECK-NEXT:    vmul.f32 q1, q3, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vstrh.32 q1, [r1], #32
 ; CHECK-NEXT:    le lr, .LBB8_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
@@ -345,21 +345,8 @@
 define arm_aapcs_vfpcc void @store_trunc_4(<4 x half>* %src, <4 x float> %val) {
 ; CHECK-LABEL: store_trunc_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s0
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s1
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s2
-; CHECK-NEXT:    vmov.16 q1[1], r2
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s3
-; CHECK-NEXT:    vmov.16 q1[2], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q1[3], r1
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    strd r1, r2, [r0]
+; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
+; CHECK-NEXT:    vstrh.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %e = fptrunc <4 x float> %val to <4 x half>
@@ -370,31 +357,10 @@
 define arm_aapcs_vfpcc void @store_trunc_8(<8 x half>* %src, <8 x float> %val) {
 ; CHECK-LABEL: store_trunc_8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s0
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s1
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s2
-; CHECK-NEXT:    vmov.16 q2[1], r2
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s3
-; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s4
-; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s5
-; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s6
-; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s7
-; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q2[7], r1
-; CHECK-NEXT:    vstrw.32 q2, [r0]
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
+; CHECK-NEXT:    vstrh.32 q1, [r0, #8]
+; CHECK-NEXT:    vstrh.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %e = fptrunc <8 x float> %val to <8 x half>
@@ -405,59 +371,14 @@
 define arm_aapcs_vfpcc void @store_trunc_16(<16 x half>* %src, <16 x float> %val) {
 ; CHECK-LABEL: store_trunc_16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10}
-; CHECK-NEXT:    vpush {d8, d9, d10}
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s8
-; CHECK-NEXT:    vmov r1, s16
-; CHECK-NEXT:    vcvtb.f16.f32 s16, s9
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov.16 q4[0], r1
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s10
-; CHECK-NEXT:    vmov.16 q4[1], r2
-; CHECK-NEXT:    vmov r1, s20
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s11
-; CHECK-NEXT:    vmov.16 q4[2], r1
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s12
-; CHECK-NEXT:    vmov.16 q4[3], r1
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s13
-; CHECK-NEXT:    vmov.16 q4[4], r1
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s14
-; CHECK-NEXT:    vmov.16 q4[5], r1
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s15
-; CHECK-NEXT:    vmov.16 q4[6], r1
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov.16 q4[7], r1
-; CHECK-NEXT:    vstrw.32 q4, [r0, #16]
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s0
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vcvtb.f16.f32 s8, s1
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vcvtb.f16.f32 s12, s2
-; CHECK-NEXT:    vmov.16 q2[1], r2
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s3
-; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s4
-; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s5
-; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s6
-; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s7
-; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q2[7], r1
-; CHECK-NEXT:    vstrw.32 q2, [r0]
-; CHECK-NEXT:    vpop {d8, d9, d10}
+; CHECK-NEXT:    vcvtb.f16.f32 q3, q3
+; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
+; CHECK-NEXT:    vstrh.32 q3, [r0, #24]
+; CHECK-NEXT:    vstrh.32 q2, [r0, #16]
+; CHECK-NEXT:    vstrh.32 q1, [r0, #8]
+; CHECK-NEXT:    vstrh.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %e = fptrunc <16 x float> %val to <16 x half>
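
For illustration, the store-splitting combine above now fires on a plain store of an fptrunc (FP_ROUND) result. A minimal sketch of the pattern, modeled on the store_trunc_4 test updated above; the function name, the trailing store, and the RUN invocation are assumptions here (MVE tests typically run under llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp):

define arm_aapcs_vfpcc void @store_trunc_4_sketch(<4 x half>* %src, <4 x float> %val) {
entry:
  ; The v4f32 -> v4f16 fptrunc feeding the store is matched by the combine:
  ; it is rewritten as an ARMISD::VCVTN into the bottom f16 lanes, a
  ; VECTOR_REG_CAST to v4i32, and a truncating integer store of v4i16.
  %e = fptrunc <4 x float> %val to <4 x half>
  store <4 x half> %e, <4 x half>* %src
  ret void
}

With the patch this compiles to a single vcvtb.f16.f32 q0, q0 followed by a truncating vstrh.32 q0, [r0], instead of four scalar vcvtb conversions and per-lane vmov.16 inserts.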