diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -300,6 +300,9 @@ // Pseudo-instruction representing a memory copy using a tail predicated // loop MEMCPYLOOP, + // Pseudo-instruction representing a memset using a tail predicated + // loop + MEMSETLOOP, // V8.1MMainline condition select CSINV, // Conditional select invert. diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1802,6 +1802,7 @@ MAKE_CASE(ARMISD::CSNEG) MAKE_CASE(ARMISD::CSINC) MAKE_CASE(ARMISD::MEMCPYLOOP) + MAKE_CASE(ARMISD::MEMSETLOOP) #undef MAKE_CASE } return nullptr; @@ -11068,7 +11069,6 @@ MachineBasicBlock *TpExit, Register OpSizeReg, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI) { - // Calculates loop iteration count = ceil(n/16)/16 = ((n + 15)&(-16)) / 16. Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg) @@ -11110,17 +11110,21 @@ const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI, Register OpSrcReg, Register OpDestReg, Register ElementCountReg, - Register TotalIterationsReg) { - - // First insert 4 PHI nodes for: Current pointer to Src, Dest array, loop - // iteration counter, predication counter Current position in the src array - Register SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); - Register CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); - BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg) - .addUse(OpSrcReg) - .addMBB(TpEntry) - .addUse(CurrSrcReg) - .addMBB(TpLoopBody); + Register TotalIterationsReg, bool IsMemcpy) { + // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest + // array, loop iteration counter, predication counter. 
+
+  Register SrcPhiReg, CurrSrcReg;
+  if (IsMemcpy) {
+    // Current position in the src array
+    SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+    CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+    BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
+        .addUse(OpSrcReg)
+        .addMBB(TpEntry)
+        .addUse(CurrSrcReg)
+        .addMBB(TpLoopBody);
+  }
 
   // Current position in the dest array
   Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
@@ -11163,19 +11167,23 @@
       .add(predOps(ARMCC::AL))
       .addReg(0);
 
-  // VLDRB and VSTRB instructions, predicated using VPR
-  Register LoadedValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
-  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
-      .addDef(CurrSrcReg)
-      .addDef(LoadedValueReg)
-      .addReg(SrcPhiReg)
-      .addImm(16)
-      .addImm(ARMVCC::Then)
-      .addUse(VccrReg);
+  // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
+  Register SrcValueReg;
+  if (IsMemcpy) {
+    SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
+    BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
+        .addDef(CurrSrcReg)
+        .addDef(SrcValueReg)
+        .addReg(SrcPhiReg)
+        .addImm(16)
+        .addImm(ARMVCC::Then)
+        .addUse(VccrReg);
+  } else
+    SrcValueReg = OpSrcReg;
   BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
       .addDef(CurrDestReg)
-      .addUse(LoadedValueReg, RegState::Kill)
+      .addUse(SrcValueReg)
       .addReg(DestPhiReg)
       .addImm(16)
       .addImm(ARMVCC::Then)
@@ -11222,9 +11230,10 @@
     return BB;
   }
 
-  case ARM::MVE_MEMCPYLOOPINST: {
+  case ARM::MVE_MEMCPYLOOPINST:
+  case ARM::MVE_MEMSETLOOPINST: {
 
-    // Transformation below expands MVE_MEMCPYLOOPINST Pseudo instruction
+    // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
     // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
     // the iteration count = ceil(size_in_bytes / 16) in the TP entry block and
     // adds the relevant instructions in the TP loop Body for generation of a
@@ -11263,23 +11272,24 @@
     MF->push_back(TpLoopBody);
 
     // If any instructions are present in the current block after
-    // MVE_MEMCPYLOOPINST, split the current block and move the instructions
-    // into the newly created exit block. If there are no instructions
-    // add an explicit branch to the FallThrough block and then split.
+    // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
+    // move the instructions into the newly created exit block. If there are no
+    // instructions, add an explicit branch to the FallThrough block and then
+    // split.
     //
     // The split is required for two reasons:
    // 1) A terminator (t2WhileLoopStart) will be placed at that site.
    // 2) Since a TPLoopBody will be added later, any phis in successive blocks
    //    need to be updated. splitAt() already handles this.
-    TpExit = BB->splitAt(MI, false);
+    TpExit = BB->splitAt(MI);
     if (TpExit == BB) {
-      assert(BB->canFallThrough() &&
-             "Exit block must be FallThrough of the block containing memcpy");
+      assert(BB->canFallThrough() && "Exit block must be fallthrough of the "
+                                     "block containing memcpy/memset pseudo");
       TpExit = BB->getFallThrough();
       BuildMI(BB, dl, TII->get(ARM::t2B))
           .addMBB(TpExit)
           .add(predOps(ARMCC::AL));
-      TpExit = BB->splitAt(MI, false);
+      TpExit = BB->splitAt(MI);
     }
 
     // Add logic for iteration count
@@ -11287,8 +11297,9 @@
     genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
 
     // Add the vectorized (and predicated) load/store instructions
+    bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
     genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
-                  OpDestReg, OpSizeReg, TotalIterationsReg);
+                  OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
 
     // Connect the blocks
     TpEntry->addSuccessor(TpLoopBody);
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -6876,6 +6876,18 @@
                         [(MVE_MEMCPYLOOPNODE rGPR:$dst, rGPR:$src, rGPR:$sz)]>;
 }
 
+def SDT_MVEMEMSETLOOPNODE
+    : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisVT<1, v16i8>, SDTCisVT<2, i32>]>;
+def MVE_MEMSETLOOPNODE : SDNode<"ARMISD::MEMSETLOOP", SDT_MVEMEMSETLOOPNODE,
+                                [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
+  def MVE_MEMSETLOOPINST : PseudoInst<(outs),
+                                      (ins rGPR:$dst, MQPR:$src, rGPR:$sz),
+                                      NoItinerary,
+                                      [(MVE_MEMSETLOOPNODE rGPR:$dst, MQPR:$src, rGPR:$sz)]>;
+}
+
 def MVE_DLSTP_8  : MVE_DLSTP<"dlstp.8",  0b00>;
 def MVE_DLSTP_16 : MVE_DLSTP<"dlstp.16", 0b01>;
 def MVE_DLSTP_32 : MVE_DLSTP<"dlstp.32", 0b10>;
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -139,6 +139,33 @@
   return CallResult.second;
 }
 
+static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
+                                       const SelectionDAG &DAG,
+                                       ConstantSDNode *ConstantSize,
+                                       Align Alignment, bool IsMemcpy) {
+  auto &F = DAG.getMachineFunction().getFunction();
+  if (!EnableMemtransferTPLoop)
+    return false;
+  if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
+    return true;
+  // Do not generate an inline TP loop if optimization is disabled,
+  // or if optimization for size (-Os or -Oz) is on.
+  if (F.hasOptNone() || F.hasOptSize())
+    return false;
+  // If the CLI option is unset, always generate an inline TP loop for memset.
+  // For memcpy, check the further profitability conditions below.
+  if (!IsMemcpy)
+    return true;
+  if (!ConstantSize && Alignment >= Align(4))
+    return true;
+  if (ConstantSize &&
+      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
+      ConstantSize->getZExtValue() <
+          Subtarget.getMaxMemcpyTPInlineSizeThreshold())
+    return true;
+  return false;
+}
+
 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
     SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
@@ -147,29 +174,8 @@
       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
 
-  auto GenInlineTP = [&](const ARMSubtarget &Subtarget,
-                         const SelectionDAG &DAG) {
-    auto &F = DAG.getMachineFunction().getFunction();
-    if (!EnableMemtransferTPLoop)
-      return false;
-    if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
-      return true;
-    // Do not generate inline TP loop if optimizations is disabled,
-    // or if optimization for size (-Os or -Oz) is on.
-    if (F.hasOptNone() || F.hasOptSize())
-      return false;
-    // If cli option is unset
-    if (!ConstantSize && Alignment >= Align(4))
-      return true;
-    if (ConstantSize &&
-        ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
-        ConstantSize->getZExtValue() <
-            Subtarget.getMaxTPLoopInlineSizeThreshold())
-      return true;
-    return false;
-  };
-
-  if (Subtarget.hasMVEIntegerOps() && GenInlineTP(Subtarget, DAG))
+  if (Subtarget.hasMVEIntegerOps() &&
+      shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true))
     return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
                        DAG.getZExtOrTrunc(Size, dl, MVT::i32));
 
@@ -292,6 +298,22 @@
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst,
     SDValue Src, SDValue Size, Align Alignment, bool isVolatile,
     MachinePointerInfo DstPtrInfo) const {
+
+  const ARMSubtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
+
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+
+  // Generate a TP loop for llvm.memset
+  if (Subtarget.hasMVEIntegerOps() &&
+      shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
+                                 false)) {
+    Src = DAG.getSplatBuildVector(MVT::v16i8, dl,
+                                  DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src));
+    return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src,
+                       DAG.getZExtOrTrunc(Size, dl, MVT::i32));
+  }
+
   return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                 Alignment.value(), RTLIB::MEMSET);
 }
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -538,10 +538,11 @@
     return 64;
   }
 
-  /// getMaxTPLoopSizeThreshold - Returns the maximum memcpy size
-  /// that still makes it profitable to inline the call as a Tail
-  /// Predicated loop
-  unsigned getMaxTPLoopInlineSizeThreshold() const { return 128; }
+  /// getMaxMemcpyTPInlineSizeThreshold - Returns the maximum size
+  /// that still makes it profitable to inline a llvm.memcpy as a Tail
+  /// Predicated loop.
+  /// This threshold should only be used for constant size inputs.
+  unsigned getMaxMemcpyTPInlineSizeThreshold() const { return 128; }
 
   /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options. Definition of function is auto generated by tblgen.
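
A note on the trip count before the tests: the value that genTPEntry materializes with t2ADDri/t2BICri/t2LSRri is just a rounded-up division by the 16-byte MVE vector width, shared by the memcpy and memset expansions. A minimal scalar sketch of that computation (the function name is illustrative, not part of the patch):

  // ceil(SizeInBytes / 16): one loop iteration per 16-byte vector; the
  // final, partial vector is covered by tail predication (MVE_VCTP8).
  unsigned tpTripCount(unsigned SizeInBytes) {
    return (SizeInBytes + 15) / 16;
  }
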
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll @@ -58,28 +58,35 @@ define void @test_memset(i32* nocapture %x, i32 %n, i32 %m) { ; CHECK-LABEL: test_memset: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r1, #1 -; CHECK-NEXT: blt .LBB1_3 -; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: mov r4, r2 -; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: lsls r7, r2, #2 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: .LBB1_1: @ %for.body.preheader +; CHECK-NEXT: lsl.w r12, r2, #2 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: b .LBB1_2 ; CHECK-NEXT: .LBB1_2: @ %for.body -; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: bl __aeabi_memclr4 -; CHECK-NEXT: add r6, r7 -; CHECK-NEXT: subs r5, #1 -; CHECK-NEXT: bne .LBB1_2 -; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB1_4 Depth 2 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: wlstp.8 lr, r3, .LBB1_3 +; CHECK-NEXT: b .LBB1_4 +; CHECK-NEXT: .LBB1_3: @ %for.body +; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1 +; CHECK-NEXT: add r0, r12 +; CHECK-NEXT: subs r1, #1 +; CHECK-NEXT: beq .LBB1_5 +; CHECK-NEXT: b .LBB1_2 +; CHECK-NEXT: .LBB1_4: @ Parent Loop BB1_2 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vstrb.8 q0, [r4], #16 +; CHECK-NEXT: letp lr, .LBB1_4 +; CHECK-NEXT: b .LBB1_3 +; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup +; CHECK-NEXT: pop {r4, pc} entry: %cmp5 = icmp sgt i32 %n, 0 br i1 %cmp5, label %for.body, label %for.cond.cleanup diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o 2>/dev/null - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp --arm-memtransfer-tploop=allow %s -o 2>/dev/null - | FileCheck %s !0 = !{i32 1, !"wchar_size", i32 4} !1 = !{i32 1, !"min_enum_size", i32 4} @@ -592,141 +592,148 @@ ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: strd r0, r2, [sp, #24] @ 8-byte Folded Spill ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: mov r0, r3 ; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrne r0, [sp, #112] +; CHECK-NEXT: ldrne r0, [sp, #136] ; 
CHECK-NEXT: cmpne r0, #0 ; CHECK-NEXT: bne .LBB10_2 ; CHECK-NEXT: .LBB10_1: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #24 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .LBB10_2: @ %for.cond1.preheader.us.preheader -; CHECK-NEXT: ldr.w r9, [sp, #116] -; CHECK-NEXT: mov r6, r1 -; CHECK-NEXT: movs r1, #1 -; CHECK-NEXT: mov r11, r2 -; CHECK-NEXT: bic r10, r9, #3 -; CHECK-NEXT: mov.w r8, #0 -; CHECK-NEXT: sub.w r0, r10, #4 -; CHECK-NEXT: add.w r0, r1, r0, lsr #2 -; CHECK-NEXT: ldr r1, [sp, #112] -; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: lsl.w r0, r9, #1 -; CHECK-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-NEXT: adr r0, .LCPI10_0 -; CHECK-NEXT: vdup.32 q4, r1 -; CHECK-NEXT: vldrw.u32 q5, [r0] -; CHECK-NEXT: lsls r4, r1, #1 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: vshl.i32 q6, q4, #2 -; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: ldr.w r12, [sp, #140] +; CHECK-NEXT: movs r7, #1 +; CHECK-NEXT: mov.w r11, #0 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: bic r2, r12, #3 +; CHECK-NEXT: subs r3, r2, #4 +; CHECK-NEXT: add.w r0, r7, r3, lsr #2 +; CHECK-NEXT: ldr r7, [sp, #136] +; CHECK-NEXT: adr r3, .LCPI10_0 ; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: b .LBB10_5 -; CHECK-NEXT: .LBB10_3: @ %for.cond5.preheader.us73.preheader -; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 -; CHECK-NEXT: add.w r0, r11, r12, lsl #1 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: bl __aeabi_memclr -; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us -; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload -; CHECK-NEXT: add r8, r9 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: add r1, r0 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: adds r1, #1 -; CHECK-NEXT: cmp r1, r0 -; CHECK-NEXT: beq .LBB10_1 -; CHECK-NEXT: .LBB10_5: @ %for.cond1.preheader.us +; CHECK-NEXT: lsl.w r0, r12, #1 +; CHECK-NEXT: vdup.32 q1, r7 +; CHECK-NEXT: vldrw.u32 q2, [r3] +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: lsls r6, r7, #1 +; CHECK-NEXT: vshl.i32 q3, q1, #2 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: b .LBB10_3 +; CHECK-NEXT: .LBB10_3: @ %for.cond1.preheader.us ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB10_8 Depth 2 -; CHECK-NEXT: @ Child Loop BB10_11 Depth 3 -; CHECK-NEXT: @ Child Loop BB10_14 Depth 3 -; CHECK-NEXT: ldr r0, [sp, #112] -; CHECK-NEXT: cmp.w r9, #0 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: mul r12, r1, r0 -; CHECK-NEXT: beq .LBB10_3 -; CHECK-NEXT: @ %bb.6: @ %for.cond5.preheader.us.us.preheader -; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: b .LBB10_8 -; CHECK-NEXT: .LBB10_7: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us -; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: ldr r0, [sp, #112] -; CHECK-NEXT: add.w r3, r1, r12 -; CHECK-NEXT: adds r1, #1 -; CHECK-NEXT: cmp r1, r0 -; CHECK-NEXT: strh.w r2, [r11, r3, lsl #1] -; CHECK-NEXT: beq .LBB10_4 -; CHECK-NEXT: .LBB10_8: @ %for.cond5.preheader.us.us -; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 +; CHECK-NEXT: @ Child Loop BB10_6 Depth 2 +; CHECK-NEXT: @ 
Child Loop BB10_9 Depth 3 +; CHECK-NEXT: @ Child Loop BB10_12 Depth 3 +; CHECK-NEXT: @ Child Loop BB10_15 Depth 2 +; CHECK-NEXT: mul r5, r3, r7 +; CHECK-NEXT: cmp.w r12, #0 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: beq .LBB10_13 +; CHECK-NEXT: @ %bb.4: @ %for.cond5.preheader.us.us.preheader +; CHECK-NEXT: @ in Loop: Header=BB10_3 Depth=1 +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: b .LBB10_6 +; CHECK-NEXT: .LBB10_5: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us +; CHECK-NEXT: @ in Loop: Header=BB10_6 Depth=2 +; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: add.w r0, r8, r5 +; CHECK-NEXT: add.w r8, r8, #1 +; CHECK-NEXT: cmp r8, r7 +; CHECK-NEXT: strh.w r10, [r3, r0, lsl #1] +; CHECK-NEXT: beq .LBB10_14 +; CHECK-NEXT: .LBB10_6: @ %for.cond5.preheader.us.us +; CHECK-NEXT: @ Parent Loop BB10_3 Depth=1 ; CHECK-NEXT: @ => This Loop Header: Depth=2 -; CHECK-NEXT: @ Child Loop BB10_11 Depth 3 -; CHECK-NEXT: @ Child Loop BB10_14 Depth 3 -; CHECK-NEXT: cmp.w r9, #3 -; CHECK-NEXT: bhi .LBB10_10 -; CHECK-NEXT: @ %bb.9: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: movs r7, #0 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: b .LBB10_13 -; CHECK-NEXT: .LBB10_10: @ %vector.ph -; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: vmov q1, q4 -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: vmlas.u32 q1, q5, r1 +; CHECK-NEXT: @ Child Loop BB10_9 Depth 3 +; CHECK-NEXT: @ Child Loop BB10_12 Depth 3 +; CHECK-NEXT: cmp.w r12, #3 +; CHECK-NEXT: bhi .LBB10_8 +; CHECK-NEXT: @ %bb.7: @ in Loop: Header=BB10_6 Depth=2 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: b .LBB10_11 +; CHECK-NEXT: .LBB10_8: @ %vector.ph +; CHECK-NEXT: @ in Loop: Header=BB10_6 Depth=2 +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vmlas.u32 q5, q2, r8 ; CHECK-NEXT: dls lr, r0 -; CHECK-NEXT: ldr r2, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: .LBB10_11: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 -; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 +; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: .LBB10_9: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB10_3 Depth=1 +; CHECK-NEXT: @ Parent Loop BB10_6 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q2, q1, q6 -; CHECK-NEXT: vldrh.s32 q3, [r6, q1, uxtw #1] -; CHECK-NEXT: vldrh.s32 q1, [r2], #8 -; CHECK-NEXT: vmul.i32 q1, q3, q1 -; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vmov q1, q2 -; CHECK-NEXT: le lr, .LBB10_11 -; CHECK-NEXT: @ %bb.12: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: vaddv.u32 r2, q0 -; CHECK-NEXT: cmp r10, r9 -; CHECK-NEXT: mov r7, r10 -; CHECK-NEXT: beq .LBB10_7 -; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader -; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: ldr r0, [sp, #112] -; CHECK-NEXT: add.w r5, r8, r7 -; CHECK-NEXT: sub.w lr, r9, r7 -; CHECK-NEXT: mla r3, r0, r7, r1 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: add.w r5, r0, r5, lsl #1 -; CHECK-NEXT: add.w r3, r6, r3, lsl #1 -; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us -; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 -; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 +; CHECK-NEXT: vadd.i32 q6, q5, q3 +; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1] +; CHECK-NEXT: vldrh.s32 q5, [r3], #8 +; CHECK-NEXT: vmul.i32 q5, q7, q5 +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmov q5, q6 +; CHECK-NEXT: le lr, .LBB10_9 +; 
CHECK-NEXT: @ %bb.10: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB10_6 Depth=2 +; CHECK-NEXT: vaddv.u32 r10, q4 +; CHECK-NEXT: cmp r2, r12 +; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: beq .LBB10_5 +; CHECK-NEXT: .LBB10_11: @ %for.body8.us.us.preheader +; CHECK-NEXT: @ in Loop: Header=BB10_6 Depth=2 +; CHECK-NEXT: mla r3, r7, r4, r8 +; CHECK-NEXT: add.w r0, r11, r4 +; CHECK-NEXT: ldr r7, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: sub.w lr, r12, r4 +; CHECK-NEXT: add.w r9, r7, r0, lsl #1 +; CHECK-NEXT: ldr r7, [sp, #136] +; CHECK-NEXT: add.w r3, r1, r3, lsl #1 +; CHECK-NEXT: .LBB10_12: @ %for.body8.us.us +; CHECK-NEXT: @ Parent Loop BB10_3 Depth=1 +; CHECK-NEXT: @ Parent Loop BB10_6 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: ldrsh.w r0, [r3] -; CHECK-NEXT: add r3, r4 -; CHECK-NEXT: ldrsh r7, [r5], #2 -; CHECK-NEXT: smlabb r2, r0, r7, r2 -; CHECK-NEXT: le lr, .LBB10_14 -; CHECK-NEXT: b .LBB10_7 +; CHECK-NEXT: ldrsh.w r4, [r3] +; CHECK-NEXT: add r3, r6 +; CHECK-NEXT: ldrsh r0, [r9], #2 +; CHECK-NEXT: smlabb r10, r4, r0, r10 +; CHECK-NEXT: le lr, .LBB10_12 +; CHECK-NEXT: b .LBB10_5 +; CHECK-NEXT: .LBB10_13: @ %for.cond5.preheader.us73.preheader +; CHECK-NEXT: @ in Loop: Header=BB10_3 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: add.w r3, r0, r5, lsl #1 +; CHECK-NEXT: mov r5, r6 +; CHECK-NEXT: wlstp.8 lr, r5, .LBB10_14 +; CHECK-NEXT: b .LBB10_15 +; CHECK-NEXT: .LBB10_14: @ %for.cond1.for.cond.cleanup3_crit_edge.us +; CHECK-NEXT: @ in Loop: Header=BB10_3 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: add r11, r12 +; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: add r3, r0 +; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: adds r3, #1 +; CHECK-NEXT: cmp r3, r0 +; CHECK-NEXT: beq.w .LBB10_1 +; CHECK-NEXT: b .LBB10_3 +; CHECK-NEXT: .LBB10_15: @ Parent Loop BB10_3 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vstrb.8 q0, [r3], #16 +; CHECK-NEXT: letp lr, .LBB10_15 +; CHECK-NEXT: b .LBB10_14 ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.15: +; CHECK-NEXT: @ %bb.16: ; CHECK-NEXT: .LCPI10_0: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 1 @ 0x1 diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll --- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll +++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp --arm-memtransfer-tploop=allow -verify-machineinstrs %s -o - | FileCheck %s ; verify-machineinstrs previously caught the incorrect use of QPR in the stack reloads. @@ -147,65 +147,74 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: .pad #392 -; CHECK-NEXT: sub sp, #392 -; CHECK-NEXT: movw r9, :lower16:.L_MergedGlobals -; CHECK-NEXT: vldr s0, .LCPI1_0 -; CHECK-NEXT: movt r9, :upper16:.L_MergedGlobals -; CHECK-NEXT: vldr s3, .LCPI1_1 -; CHECK-NEXT: mov r7, r9 -; CHECK-NEXT: mov r5, r9 -; CHECK-NEXT: ldr r0, [r7, #4]! -; CHECK-NEXT: movw r4, :lower16:e -; CHECK-NEXT: ldr r1, [r5, #8]! 
-; CHECK-NEXT: movt r4, :upper16:e -; CHECK-NEXT: vmov r6, s3 -; CHECK-NEXT: vdup.32 q4, r7 -; CHECK-NEXT: vmov s1, r7 -; CHECK-NEXT: vmov q1[2], q1[0], r5, r5 -; CHECK-NEXT: vmov s9, r4 -; CHECK-NEXT: vmov q1[3], q1[1], r6, r4 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov q3, q4 -; CHECK-NEXT: vmov.f32 s8, s0 -; CHECK-NEXT: vmov q5, q4 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vstrw.32 q1, [sp, #76] -; CHECK-NEXT: vmov q1[2], q1[0], r7, r6 -; CHECK-NEXT: mov.w r8, #4 -; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: vmov q1[3], q1[1], r7, r4 -; CHECK-NEXT: vmov.32 q3[0], r4 -; CHECK-NEXT: vmov.32 q5[1], r4 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #416 +; CHECK-NEXT: sub sp, #416 +; CHECK-NEXT: movw r7, :lower16:.L_MergedGlobals +; CHECK-NEXT: vldr s12, .LCPI1_0 +; CHECK-NEXT: movt r7, :upper16:.L_MergedGlobals +; CHECK-NEXT: vldr s15, .LCPI1_1 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: mov r4, r7 +; CHECK-NEXT: ldr r0, [r3, #4]! +; CHECK-NEXT: movw r2, :lower16:e +; CHECK-NEXT: ldr r6, [r4, #8]! +; CHECK-NEXT: vmov r5, s15 +; CHECK-NEXT: vmov s13, r3 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: movt r2, :upper16:e +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov q0[2], q0[0], r4, r4 +; CHECK-NEXT: vmov s21, r2 +; CHECK-NEXT: vmov.f32 s14, s13 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r2 +; CHECK-NEXT: vmov.f32 s20, s12 +; CHECK-NEXT: vdup.32 q7, r3 +; CHECK-NEXT: vmov q6[2], q6[0], r3, r5 +; CHECK-NEXT: vmov.f32 s22, s13 +; CHECK-NEXT: vstrw.32 q0, [sp, #100] +; CHECK-NEXT: vmov q0, q7 +; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: vmov q4, q7 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q7[1], r2 +; CHECK-NEXT: vmov.f32 s23, s15 ; CHECK-NEXT: movs r1, #64 -; CHECK-NEXT: strh.w r8, [sp, #390] -; CHECK-NEXT: strd r0, r10, [sp, #24] -; CHECK-NEXT: vstrw.32 q0, [sp, #44] -; CHECK-NEXT: str r0, [r0] -; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: str r0, [sp, #48] ; CHECK-NEXT: vstrw.32 q5, [r0] -; CHECK-NEXT: vstrw.32 q3, [r0] -; CHECK-NEXT: vstrw.32 q1, [r0] -; CHECK-NEXT: bl __aeabi_memclr4 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r7 -; CHECK-NEXT: vmov q1[2], q1[0], r7, r7 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: vmov q1[3], q1[1], r5, r6 -; CHECK-NEXT: vmov.32 q4[0], r10 +; CHECK-NEXT: str r6, [r0] +; CHECK-NEXT: vstrw.32 q7, [r0] +; CHECK-NEXT: str r0, [r0] ; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: str.w r10, [r9] -; CHECK-NEXT: vstrw.32 q4, [r0] +; CHECK-NEXT: vstrw.32 q6, [r0] +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: vmov q1[2], q1[0], r4, r3 +; CHECK-NEXT: vmov q2[2], q2[0], r3, r3 +; CHECK-NEXT: mov.w r12, #4 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r4 +; CHECK-NEXT: vmov q2[3], q2[1], r4, r5 +; CHECK-NEXT: vmov.32 q4[0], r8 +; CHECK-NEXT: @ implicit-def: $r2 +; CHECK-NEXT: str.w r8, [sp, #52] +; CHECK-NEXT: strh.w r12, [sp, #414] +; CHECK-NEXT: vstrw.32 q3, [sp, #68] +; CHECK-NEXT: wlstp.8 lr, r1, .LBB1_2 +; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: letp lr, .LBB1_1 +; CHECK-NEXT: .LBB1_2: @ %entry ; CHECK-NEXT: vstrw.32 q1, [r0] -; CHECK-NEXT: str.w r8, [sp, #308] -; CHECK-NEXT: .LBB1_1: @ %for.cond +; CHECK-NEXT: str.w r8, [r7] +; CHECK-NEXT: vstrw.32 q4, [r0] +; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: str.w r12, [sp, #332] +; CHECK-NEXT: 
.LBB1_3: @ %for.cond
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    b .LBB1_1
+; CHECK-NEXT:    b .LBB1_3
 ; CHECK-NEXT:    .p2align 2
-; CHECK-NEXT:  @ %bb.2:
+; CHECK-NEXT:  @ %bb.4:
 ; CHECK-NEXT:  .LCPI1_0:
 ; CHECK-NEXT:    .long 0x00000004 @ float 5.60519386E-45
 ; CHECK-NEXT:  .LCPI1_1:
diff --git a/llvm/test/CodeGen/Thumb2/mve-tp-loop.ll b/llvm/test/CodeGen/Thumb2/mve-tp-loop.ll
--- a/llvm/test/CodeGen/Thumb2/mve-tp-loop.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-tp-loop.ll
@@ -8,6 +8,7 @@
 
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) #1
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
+declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg)
 
 define void @test1(i8* noalias nocapture %X, i8* noalias nocapture readonly %Y, i32 %n){
 ; CHECK-LABEL: test1:
@@ -281,5 +282,132 @@
   ret void
 }
 
+; Check that a WLSTP loop is generated for the simplest case of align = 1
+define void @test12(i8* %X, i8 zeroext %c, i32 %n) {
+; CHECK-LABEL: test12:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vdup.8 q0, r1
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB11_2
+; CHECK-NEXT:  .LBB11_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vstrb.8 q0, [r0], #16
+; CHECK-NEXT:    letp lr, .LBB11_1
+; CHECK-NEXT:  .LBB11_2: @ %entry
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  call void @llvm.memset.p0i8.i32(i8* align 1 %X, i8 %c, i32 %n, i1 false)
+  ret void
+}
+
+
+; Check that a WLSTP loop is generated for alignment >= 4
+define void @test13(i32* %X, i8 zeroext %c, i32 %n) {
+; CHECK-LABEL: test13:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vdup.8 q0, r1
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB12_2
+; CHECK-NEXT:  .LBB12_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vstrb.8 q0, [r0], #16
+; CHECK-NEXT:    letp lr, .LBB12_1
+; CHECK-NEXT:  .LBB12_2: @ %entry
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %0 = bitcast i32* %X to i8*
+  call void @llvm.memset.p0i8.i32(i8* align 4 %0, i8 %c, i32 %n, i1 false)
+  ret void
+}
+
+
+; Checks that the transform correctly handles input with some arithmetic on the input arguments.
+; void test14(int* X, char c, int n)
+; {
+;    memset(X+2, c, (n*2)+10);
+; }
+
+define void @test14(i32* %X, i8 zeroext %c, i32 %n) {
+; CHECK-LABEL: test14:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    movs r3, #10
+; CHECK-NEXT:    add.w r2, r3, r2, lsl #1
+; CHECK-NEXT:    vdup.8 q0, r1
+; CHECK-NEXT:    adds r0, #8
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB13_2
+; CHECK-NEXT:  .LBB13_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vstrb.8 q0, [r0], #16
+; CHECK-NEXT:    letp lr, .LBB13_1
+; CHECK-NEXT:  .LBB13_2: @ %entry
+; CHECK-NEXT:    pop {r7, pc}
entry:
+  %add.ptr = getelementptr inbounds i32, i32* %X, i32 2
+  %0 = bitcast i32* %add.ptr to i8*
+  %mul = shl nsw i32 %n, 1
+  %add = add nsw i32 %mul, 10
+  call void @llvm.memset.p0i8.i32(i8* nonnull align 4 %0, i8 %c, i32 %add, i1 false)
+  ret void
+}
+
+
+
+
+; Checks that the transform handles for-loops (that get implicitly converted to memset)
+; void test15(int* X, char c, int n){
+;    for(int i = 0; i < n; ++i){
+;        X[i] = c;
+;    }
+; }
+
+define void @test15(i8* nocapture %X, i8 zeroext %c, i32 %n) {
+; CHECK-LABEL: test15:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cmp r2, #1
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    bxlt lr
+; CHECK-NEXT:  .LBB14_1: @ %for.body.preheader
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vdup.8 q0, r1
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB14_3
+; CHECK-NEXT:  .LBB14_2: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vstrb.8 q0, [r0], #16
+; CHECK-NEXT:    letp lr, .LBB14_2
+; CHECK-NEXT:  .LBB14_3: @ %for.body.preheader
+; CHECK-NEXT:    pop.w {r7, lr}
+; CHECK-NEXT:    bx lr
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  call void @llvm.memset.p0i8.i32(i8* align 4 %X, i8 %c, i32 %n, i1 false)
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body.preheader, %entry
+  ret void
+}
+
+; Checks that the transform handles the case where the source value is 0. No difference is expected.
+define void @test16(i32* %X, i8 zeroext %c, i32 %n) { +; CHECK-LABEL: test16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: wlstp.8 lr, r2, .LBB15_2 +; CHECK-NEXT: .LBB15_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vstrb.8 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB15_1 +; CHECK-NEXT: .LBB15_2: @ %entry +; CHECK-NEXT: pop {r7, pc} +entry: + %0 = bitcast i32* %X to i8* + call void @llvm.memset.p0i8.i32(i8* align 4 %0, i8 0, i32 %n, i1 false) + ret void +} + attributes #0 = { noinline optnone } attributes #1 = { optsize } diff --git a/llvm/test/CodeGen/Thumb2/mve-tp-loop.mir b/llvm/test/CodeGen/Thumb2/mve-tp-loop.mir --- a/llvm/test/CodeGen/Thumb2/mve-tp-loop.mir +++ b/llvm/test/CodeGen/Thumb2/mve-tp-loop.mir @@ -1,11 +1,13 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -simplify-mir -run-pass=finalize-isel %s -o - | FileCheck %s +# RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve --arm-memtransfer-tploop=allow -simplify-mir -run-pass=finalize-isel %s -o - | FileCheck %s --- | target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "arm-arm-none-eabi" ; Function Attrs: argmemonly nofree nosync nounwind willreturn declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) + ; Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly + declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg) define void @test1(i32* noalias %X, i32* noalias readonly %Y, i32 %n) { entry: @@ -30,6 +32,27 @@ ret void } + define void @test3(i32* nocapture %X, i8 zeroext %c, i32 %n) { + entry: + %0 = bitcast i32* %X to i8* + tail call void @llvm.memset.p0i8.i32(i8* align 4 %0, i8 %c, i32 %n, i1 false) + ret void + } + + + define void @test4(i8* nocapture %X, i8 zeroext %c, i32 %n) { + entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.memset.p0i8.i32(i8* align 1 %X, i8 %c, i32 %n, i1 false) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %for.body.preheader, %entry + ret void + } + ... 
--- name: test1 @@ -56,7 +79,7 @@ ; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg ; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]] - ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]] + ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]] ; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1 ; CHECK: t2LoopEnd [[t2LoopDec]], %bb.1, implicit-def $cpsr ; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg @@ -97,7 +120,7 @@ ; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg ; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]] - ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]] + ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]] ; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1 ; CHECK: t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def $cpsr ; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg @@ -125,3 +148,92 @@ tBX_RET 14 /* CC::al */, $noreg ... +--- +name: test3 +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r0, $r1, $r2 + + ; CHECK-LABEL: name: test3 + ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2 + ; CHECK: [[COPY1:%[0-9]+]]:mqpr = COPY $r1 + ; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]] + ; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.2, implicit-def $cpsr + ; CHECK: .1: + ; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.0, %8, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.0, %10, %bb.1 + ; CHECK: [[PHI2:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.0, %12, %bb.1 + ; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI2]], 0, $noreg + ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 16, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[COPY1]], [[PHI]], 16, 1, [[MVE_VCTP8_]] + ; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI1]], 1 + ; CHECK: t2LoopEnd [[t2LoopDec]], %bb.1, implicit-def $cpsr + ; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg + ; CHECK: .2.entry: + ; CHECK: tBX_RET 14 /* CC::al */, $noreg + %2:rgpr = COPY $r2 + %1:mqpr = COPY $r1 + %0:rgpr = COPY $r0 + MVE_MEMSETLOOPINST %0, %1, %2 + tBX_RET 14 /* CC::al */, $noreg + +... 
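
The test3 expansion above shows the memset-specific shape of the loop body: there is no MVE_VLDRBU8_post, and the splatted value (the mqpr COPY of $r1) feeds MVE_VSTRBU8_post directly. As a sketch only, with illustrative names that are not from the patch, the predicated loop computes the equivalent of this scalar model:

  // Scalar model of the tail-predicated memset loop body.
  void tpMemsetModel(unsigned char *Dst, unsigned char Value, unsigned N) {
    while (N > 0) {
      unsigned Lanes = N < 16 ? N : 16; // MVE_VCTP8: enable only the valid lanes
      for (unsigned I = 0; I < Lanes; ++I)
        Dst[I] = Value;                 // predicated MVE_VSTRBU8_post
      Dst += 16;                        // post-increment by the vector width
      N -= Lanes;                       // t2SUBri/t2LoopDec counting down
    }
  }
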
+--- +name: test4 +alignment: 2 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: test4 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x50000000), %bb.2(0x30000000) + ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2 + ; CHECK: [[COPY1:%[0-9]+]]:mqpr = COPY $r1 + ; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2Bcc %bb.2, 11 /* CC::lt */, $cpsr + ; CHECK: t2B %bb.1, 14 /* CC::al */, $noreg + ; CHECK: bb.1.for.body.preheader: + ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]] + ; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.4, implicit-def $cpsr + ; CHECK: bb.3: + ; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.1, %8, %bb.3 + ; CHECK: [[PHI1:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.1, %10, %bb.3 + ; CHECK: [[PHI2:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.1, %12, %bb.3 + ; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI2]], 0, $noreg + ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 16, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[COPY1]], [[PHI]], 16, 1, [[MVE_VCTP8_]] + ; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI1]], 1 + ; CHECK: t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def $cpsr + ; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg + ; CHECK: bb.4.for.body.preheader: + ; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg + ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: tBX_RET 14 /* CC::al */, $noreg + bb.0.entry: + successors: %bb.1(0x50000000), %bb.2(0x30000000) + liveins: $r0, $r1, $r2 + + %2:rgpr = COPY $r2 + %1:mqpr = COPY $r1 + %0:rgpr = COPY $r0 + t2CMPri %2, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.2, 11 /* CC::lt */, $cpsr + t2B %bb.1, 14 /* CC::al */, $noreg + + bb.1.for.body.preheader: + MVE_MEMSETLOOPINST %0, %1, %2 + + bb.2.for.cond.cleanup: + tBX_RET 14 /* CC::al */, $noreg + +...
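
Taken together, the patch lets a plain memset such as the one below be lowered to the inline wlstp.8/vstrb.8/letp sequence exercised above, instead of a __aeabi_memset/__aeabi_memclr libcall. A hedged usage example, assuming -O2, an MVE target (e.g. -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve), and --arm-memtransfer-tploop=allow:

  #include <cstring>

  void zeroBuffer(char *Dst, unsigned N) {
    // Expected to become a tail-predicated loop like the one in test12;
    // without the flag (or at -O0/-Os/-Oz) it remains a libcall.
    std::memset(Dst, 0, N);
  }
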