diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -292,6 +292,8 @@ // Pseudo-instruction representing a memory copy using ldm/stm // instructions. MEMCPY, + // Pseudo-instruction representing a memory copy using a tail predicated loop + MEMCPYLOOP, // V8.1MMainline condition select CSINV, // Conditional select invert. diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1813,6 +1813,7 @@ case ARMISD::CSINV: return "ARMISD::CSINV"; case ARMISD::CSNEG: return "ARMISD::CSNEG"; case ARMISD::CSINC: return "ARMISD::CSINC"; + case ARMISD::MEMCPYLOOP: return "ARMISD::MEMCPYLOOP"; } return nullptr; } @@ -11071,6 +11072,148 @@ return true; } +/// Adds logic in loop entry MBB to calculate loop iteration count and adds +/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop +static Register genTPEntry(MachineBasicBlock *MBB_TP_entry, + MachineBasicBlock *MBB_TP_ph, + MachineBasicBlock *MBB_TP_loopBody, + MachineBasicBlock *MBB_TP_exit, Register opSizeReg, + const TargetInstrInfo *TII, DebugLoc dl, + MachineRegisterInfo &MRI) { + + // Calculates loop iteration count = ceil(n/16) = ((n + 15) & -16) / 16. FIXME: the t2BICri below uses imm 16, which clears only bit 4 (BIC computes rn & ~imm); imm 15 would be needed to implement & -16 — verify intended mask. 
+ Register t2ADDriDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + BuildMI(MBB_TP_entry, dl, TII->get(ARM::t2ADDri), t2ADDriDestReg) + .addUse(opSizeReg) + .addImm(15) + .add(predOps(ARMCC::AL)) + .addReg(0); + + Register t2BICriDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + BuildMI(MBB_TP_entry, dl, TII->get(ARM::t2BICri), t2BICriDestReg) + .addUse(t2ADDriDestReg, RegState::Kill) + .addImm(16) + .add(predOps(ARMCC::AL)) + .addReg(0); + + Register t2LSRiDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + BuildMI(MBB_TP_entry, dl, TII->get(ARM::t2LSRri), t2LSRiDestReg) + .addUse(t2BICriDestReg, RegState::Kill) + .addImm(4) + .add(predOps(ARMCC::AL)) + .addReg(0); + + Register totalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); + BuildMI(MBB_TP_entry, dl, TII->get(ARM::t2WhileLoopSetup), totalIterationsReg) + .addUse(t2LSRiDestReg, RegState::Kill); + + BuildMI(MBB_TP_entry, dl, TII->get(ARM::t2WhileLoopStart)) + .addUse(totalIterationsReg) + .addMBB(MBB_TP_exit); + + BuildMI(MBB_TP_entry, dl, TII->get(ARM::t2B)) + .addMBB(MBB_TP_ph) + .add(predOps(ARMCC::AL)); + + return totalIterationsReg; +} + +/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and +/// t2DoLoopEnd. These are used by later passes to generate tail predicated +/// loops. 
+static void genTPLoopBody(MachineBasicBlock *MBB_TP_loopBody, + MachineBasicBlock *MBB_TP_ph, + MachineBasicBlock *MBB_TP_exit, + const TargetInstrInfo *TII, DebugLoc dl, + MachineRegisterInfo &MRI, Register opSrcReg, + Register opDestReg, Register elementCountReg, + Register totalIterationsReg) { + + // First insert 4 PHI nodes for: Current pointer to Src, Dest array, loop + // iteration counter, predication counter Current position in the src array + Register srcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + Register currSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + BuildMI(MBB_TP_loopBody, dl, TII->get(ARM::PHI), srcPhiReg) + .addUse(opSrcReg) + .addMBB(MBB_TP_ph) + .addUse(currSrcReg) + .addMBB(MBB_TP_loopBody); + + // Current position in the dest array + Register destPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + Register currDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + BuildMI(MBB_TP_loopBody, dl, TII->get(ARM::PHI), destPhiReg) + .addUse(opDestReg) + .addMBB(MBB_TP_ph) + .addUse(currDestReg) + .addMBB(MBB_TP_loopBody); + + // Current loop counter + Register loopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); + Register remainingLoopIterationsReg = + MRI.createVirtualRegister(&ARM::GPRlrRegClass); + BuildMI(MBB_TP_loopBody, dl, TII->get(ARM::PHI), loopCounterPhiReg) + .addUse(totalIterationsReg) + .addMBB(MBB_TP_ph) + .addUse(remainingLoopIterationsReg) + .addMBB(MBB_TP_loopBody); + + // Predication counter + Register predCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + Register remainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + BuildMI(MBB_TP_loopBody, dl, TII->get(ARM::PHI), predCounterPhiReg) + .addUse(elementCountReg) + .addMBB(MBB_TP_ph) + .addUse(remainingElementsReg) + .addMBB(MBB_TP_loopBody); + + // Pass predication counter to VCTP + Register vccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass); + BuildMI(MBB_TP_loopBody, dl, 
TII->get(ARM::MVE_VCTP8), vccrReg) + .addUse(predCounterPhiReg) + .addImm(ARMVCC::None) + .addReg(0); + + BuildMI(MBB_TP_loopBody, dl, TII->get(ARM::t2SUBri), remainingElementsReg) + .addUse(predCounterPhiReg) + .addImm(16) + .add(predOps(ARMCC::AL)) + .addReg(0); + + // VLDRB and VSTRB instructions, predicated using VPR + Register loadedValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass); + BuildMI(MBB_TP_loopBody, dl, TII->get(ARM::MVE_VLDRBU8_post)) + .addDef(currSrcReg) + .addDef(loadedValueReg) + .addReg(srcPhiReg) + .addImm(16) + .addImm(ARMVCC::Then) + .addUse(vccrReg); + + BuildMI(MBB_TP_loopBody, dl, TII->get(ARM::MVE_VSTRBU8_post)) + .addDef(currDestReg) + .addUse(loadedValueReg, RegState::Kill) + .addReg(destPhiReg) + .addImm(16) + .addImm(ARMVCC::Then) + .addUse(vccrReg); + + // Add the pseudoInstrs for decrementing the loop counter and marking the + // end:t2DoLoopDec and t2DoLoopEnd + BuildMI(MBB_TP_loopBody, dl, TII->get(ARM::t2LoopDec), + remainingLoopIterationsReg) + .addUse(loopCounterPhiReg) + .addImm(1); + + BuildMI(MBB_TP_loopBody, dl, TII->get(ARM::t2LoopEnd)) + .addUse(remainingLoopIterationsReg) + .addMBB(MBB_TP_loopBody); + + BuildMI(MBB_TP_loopBody, dl, TII->get(ARM::t2B)) + .addMBB(MBB_TP_exit) + .add(predOps(ARMCC::AL)); +} + MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -11097,6 +11240,84 @@ return BB; } + case ARM::MVE_MEMCPYLOOPINST: { + + // Transformation below expands MVE_MEMCPYLOOPINST Pseudo instruction + // into a Tail Predicated (TP) Loop. It adds the instructions to calculate the + // iteration count =ceil(size_in_bytes/16)) in the TP entry block and adds + // the relevant instructions in the TP loop Body for generation of a WLSTP loop. + + // Below is relevant portion of the CFG after the transformation. + // The Machine Basic Blocks are shown along with branch conditions (in + // brackets). 
Note that TP entry/exit MBBs depict the entry/exit of this portion + // of the CFG and may not necessarily be the entry/exit of the function. + + // (Relevant) CFG after transformation: + // TP entry MBB + // | + // |-----------------| + // (n <= 0) (n > 0) + // | | + // | TP loop preHeader MBB + // | | + // | TP loop Body MBB + // \ | + // \ / + // TP exit MBB + + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + Register opDestReg = MI.getOperand(0).getReg(); + Register opSrcReg = MI.getOperand(1).getReg(); + Register opSizeReg = MI.getOperand(2).getReg(); + + // Allocate the required MBBs and add to parent function. + MachineBasicBlock *MBB_TP_entry = BB; + MachineBasicBlock *MBB_TP_ph = MF->CreateMachineBasicBlock(); + MachineBasicBlock *MBB_TP_loopBody = MF->CreateMachineBasicBlock(); + MachineBasicBlock *MBB_TP_exit; + + MF->push_back(MBB_TP_ph); + MF->push_back(MBB_TP_loopBody); + + // For exit MBB, move instructions (after MVE_MEMCPYLOOPINST) from entry MBB + // into the exit MBB This is required since a branch instruction (which is + // a terminator) is placed in entry MBB at the memcpy call site. 
+ MBB_TP_exit = MBB_TP_entry->splitAt(MI, false); + + // Add logic for iteration count + Register totalIterationsReg = + genTPEntry(MBB_TP_entry, MBB_TP_ph, MBB_TP_loopBody, MBB_TP_exit, + opSizeReg, TII, dl, MRI); + + BuildMI(MBB_TP_ph, dl, TII->get(ARM::t2B)) + .addMBB(MBB_TP_loopBody) + .add(predOps(ARMCC::AL)); + + // Add the vectorized (and predicated) loads/store instructions + genTPLoopBody(MBB_TP_loopBody, MBB_TP_ph, MBB_TP_exit, TII, dl, MRI, opSrcReg, + opDestReg, opSizeReg, totalIterationsReg); + + // Connect the blocks + MBB_TP_entry->addSuccessor(MBB_TP_ph); + MBB_TP_ph->addSuccessor(MBB_TP_loopBody); + MBB_TP_loopBody->addSuccessor(MBB_TP_loopBody); + MBB_TP_loopBody->addSuccessor(MBB_TP_exit); + + // Reorder for better readability of generated MIR + MBB_TP_ph->moveAfter(MBB_TP_entry); + MBB_TP_loopBody->moveAfter(MBB_TP_ph); + MBB_TP_exit->moveAfter(MBB_TP_loopBody); + + // Finally, remove the memcpy Pseudo Instruction + MI.eraseFromParent(); + + // Return the exit block as it may contain other instructions requiring a + // custom inserter + return MBB_TP_exit; + } + // The Thumb2 pre-indexed stores have the same MI operands, they just // define them differently in the .td files from the isel patterns, so // they need pseudos. 
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -6864,6 +6864,23 @@ let isTerminator = 1; } +def SDT_MVEMEMCPYLOOPNODE + : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>]>; +def MVE_MEMCPYLOOPNODE : SDNode<"ARMISD::MEMCPYLOOP", SDT_MVEMEMCPYLOOPNODE, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; + +let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { + def MVE_MEMCPYLOOPINST : PseudoInst<(outs), + (ins rGPR + : $dst, rGPR + : $src, rGPR + : $sz), + NoItinerary, [(MVE_MEMCPYLOOPNODE rGPR + : $dst, rGPR + : $src, rGPR + : $sz)]>; +} + def MVE_DLSTP_8 : MVE_DLSTP<"dlstp.8", 0b00>; def MVE_DLSTP_16 : MVE_DLSTP<"dlstp.16", 0b01>; def MVE_DLSTP_32 : MVE_DLSTP<"dlstp.32", 0b10>; diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp --- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -130,13 +130,24 @@ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { const ARMSubtarget &Subtarget = DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + if (Subtarget.hasMVEIntegerOps() && + !DAG.getMachineFunction().getFunction().hasOptNone()) { + if ((!ConstantSize && (Alignment >= Align(4))) || + (ConstantSize && + ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() && + ConstantSize->getZExtValue() < + Subtarget.getMaxTPLoopInlineSizeThreshold())) + return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src, + Size); + } + // Do repeated 4-byte loads and stores. To be improved. // This requires 4-byte alignment. if (Alignment < Align(4)) return SDValue(); // This requires the copy size to be a constant, preferably // within a subtarget-specific limit. 
- ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); if (!ConstantSize) return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Alignment.value(), RTLIB::MEMCPY); diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -538,6 +538,13 @@ return 64; } + /// getMaxTPLoopInlineSizeThreshold - Returns the maximum memcpy size + /// that still makes it profitable to inline the call as a Tail + /// Predicated loop + unsigned getMaxTPLoopInlineSizeThreshold() const { + return 128; + } + /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll @@ -4,31 +4,40 @@ define void @test_memcpy(i32* nocapture %x, i32* nocapture readonly %y, i32 %n, i32 %m) { ; CHECK-LABEL: test_memcpy: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: blt .LBB0_3 +; CHECK-NEXT: blt .LBB0_6 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: mov r8, r3 -; CHECK-NEXT: mov r5, r2 -; CHECK-NEXT: mov r9, r1 -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: lsls r4, r3, #2 -; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: lsl.w r12, r3, #2 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_2: @ %for.body -; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r0, r7, r6 -; CHECK-NEXT: add.w r1, r9, r6 -; CHECK-NEXT: mov r2, r8 -; 
CHECK-NEXT: bl __aeabi_memcpy4 -; CHECK-NEXT: add r6, r4 -; CHECK-NEXT: subs r5, #1 -; CHECK-NEXT: bne .LBB0_2 -; CHECK-NEXT: .LBB0_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB0_5 Depth 2 +; CHECK-NEXT: add.w r7, r3, #15 +; CHECK-NEXT: bic r7, r7, #16 +; CHECK-NEXT: lsrs r7, r7, #4 +; CHECK-NEXT: wlstp.8 lr, r7, .LBB0_3 +; CHECK-NEXT: b .LBB0_4 +; CHECK-NEXT: .LBB0_3: @ %for.body +; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: add r4, r12 +; CHECK-NEXT: subs r2, #1 +; CHECK-NEXT: beq .LBB0_6 +; CHECK-NEXT: b .LBB0_2 +; CHECK-NEXT: .LBB0_4: @ in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: adds r5, r1, r4 +; CHECK-NEXT: adds r6, r0, r4 +; CHECK-NEXT: mov r7, r3 +; CHECK-NEXT: .LBB0_5: @ Parent Loop BB0_2 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vldrb.u8 q0, [r5], #16 +; CHECK-NEXT: vstrb.8 q0, [r6], #16 +; CHECK-NEXT: letp lr, .LBB0_5 +; CHECK-NEXT: b .LBB0_3 +; CHECK-NEXT: .LBB0_6: @ %for.cond.cleanup +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %for.body, label %for.cond.cleanup diff --git a/llvm/test/CodeGen/Thumb2/mve_tp_loop.ll b/llvm/test/CodeGen/Thumb2/mve_tp_loop.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve_tp_loop.ll @@ -0,0 +1,209 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O1 -mtriple=arm-arm-none-eabi -mcpu=cortex-m55 --verify-machineinstrs %s -o - | FileCheck %s + +; Check that WLSTP loop is not generated for alignment < 4 +; void test1(char* dest, char* src, int n){ +; memcpy(dest, src, n); +; } + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) #1 + +define void @test1(i8* noalias nocapture %X, i8* noalias nocapture readonly %Y, i32 %n){ +; CHECK-LABEL: test1: +; CHECK: @ %bb.0: @ 
%entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bl __aeabi_memcpy +; CHECK-NEXT: pop {r7, pc} +entry: + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %X, i8* align 1 %Y, i32 %n, i1 false) + ret void +} + + +; Check that WLSTP loop is generated for alignment >= 4 +; void test2(int* restrict X, int* restrict Y, int n){ +; memcpy(X, Y, n); +; } + + +define void @test2(i32* noalias %X, i32* noalias readonly %Y, i32 %n){ +; CHECK-LABEL: test2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: add.w r3, r2, #15 +; CHECK-NEXT: bic r3, r3, #16 +; CHECK-NEXT: lsrs r3, r3, #4 +; CHECK-NEXT: wlstp.8 lr, r2, .LBB1_2 +; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q0, [r1], #16 +; CHECK-NEXT: vstrb.8 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB1_1 +; CHECK-NEXT: .LBB1_2: @ %entry +; CHECK-NEXT: pop {r7, pc} +entry: + %0 = bitcast i32* %X to i8* + %1 = bitcast i32* %Y to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false) + ret void +} + + +; Checks if code above/below the memcpy call is appropriately transformed. +; void test3(int* restrict X, int* restrict Y, int n){ +; printf("n = %d\n", n); +; memcpy(X, Y, n); +; printf("n = %d\n", n); +; } +declare noundef i32 @printf(i8* nocapture noundef readonly, ...) +@.str = private unnamed_addr constant [8 x i8] c"n = %d\0A\00", align 1 +declare i32 @__2printf(i8*, ...) 
+ +define hidden void @test3(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: test3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: movw r6, :lower16:.L.str +; CHECK-NEXT: movt r6, :upper16:.L.str +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: bl __2printf +; CHECK-NEXT: add.w r0, r4, #15 +; CHECK-NEXT: bic r0, r0, #16 +; CHECK-NEXT: lsrs r0, r0, #4 +; CHECK-NEXT: wlstp.8 lr, r0, .LBB2_3 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: .LBB2_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q0, [r5], #16 +; CHECK-NEXT: vstrb.8 q0, [r7], #16 +; CHECK-NEXT: letp lr, .LBB2_2 +; CHECK-NEXT: .LBB2_3: @ %entry +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: bl __2printf +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +entry: + %0 = call i32 (i8*, ...) @__2printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %n) #2 + %1 = bitcast i32* %X to i8* + %2 = bitcast i32* %Y to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %1, i8* align 4 %2, i32 %n, i1 false) + %3 = call i32 (i8*, ...) @__2printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %n) #2 + ret void +} + + + +; Checks that transform handles some arithmetic on the input arguments. 
+; void test4(int* restrict X, int* restrict Y, int n){ +; printf("n = %d\n", n); +; memcpy(X+2, Y+3, (n*2)+10); +; printf("X = %d\n", (int)X); +; } + +@.str.1 = private unnamed_addr constant [8 x i8] c"X = %d\0A\00", align 1 + +; Function Attrs: nofree nounwind +define hidden void @test4(i32* noalias %X, i32* noalias nocapture readonly %Y, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: test4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: movw r0, :lower16:.L.str +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: movt r0, :upper16:.L.str +; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: mov r6, r2 +; CHECK-NEXT: bl __2printf +; CHECK-NEXT: movs r0, #10 +; CHECK-NEXT: add.w r0, r0, r6, lsl #1 +; CHECK-NEXT: add.w r1, r0, #15 +; CHECK-NEXT: bic r1, r1, #16 +; CHECK-NEXT: lsrs r1, r1, #4 +; CHECK-NEXT: wlstp.8 lr, r0, .LBB3_3 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: add.w r1, r4, #8 +; CHECK-NEXT: add.w r2, r5, #12 +; CHECK-NEXT: .LBB3_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q0, [r2], #16 +; CHECK-NEXT: vstrb.8 q0, [r1], #16 +; CHECK-NEXT: letp lr, .LBB3_2 +; CHECK-NEXT: .LBB3_3: @ %entry +; CHECK-NEXT: movw r0, :lower16:.L.str.1 +; CHECK-NEXT: movt r0, :upper16:.L.str.1 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: bl __2printf +; CHECK-NEXT: pop {r4, r5, r6, pc} +entry: + %0 = call i32 (i8*, ...) @__2printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %n) #2 + %add.ptr = getelementptr inbounds i32, i32* %X, i32 2 + %1 = bitcast i32* %add.ptr to i8* + %add.ptr1 = getelementptr inbounds i32, i32* %Y, i32 3 + %2 = bitcast i32* %add.ptr1 to i8* + %mul = shl nsw i32 %n, 1 + %add = add nsw i32 %mul, 10 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* nonnull align 4 %1, i8* nonnull align 4 %2, i32 %add, i1 false) + %3 = ptrtoint i32* %X to i32 + %4 = call i32 (i8*, ...) 
@__2printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str.1, i32 0, i32 0), i32 %3) #2 + ret void +} + + +; Checks that transform handles for loops that are implicitly converted to mempcy +; void test5(int* restrict X, int* restrict Y, int n){ +; for(int i = 0; i < n; ++i){ +; X[i] = Y[i]; +; } +; } + +define void @test5(i32* noalias %X, i32* noalias readonly %Y, i32 %n) { +; CHECK-LABEL: test5: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r4, r5, r7, pc} +; CHECK-NEXT: .LBB4_1: +; CHECK-NEXT: add.w r3, r2, #15 +; CHECK-NEXT: bic r3, r3, #16 +; CHECK-NEXT: lsr.w r12, r3, #4 +; CHECK-NEXT: .LBB4_2: @ %for.body.preheader +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB4_4 Depth 2 +; CHECK-NEXT: wlstp.8 lr, r5, .LBB4_2 +; CHECK-NEXT: @ %bb.3: @ in Loop: Header=BB4_2 Depth=1 +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: .LBB4_4: @ Parent Loop BB4_2 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vldrb.u8 q0, [r3], #16 +; CHECK-NEXT: vstrb.8 q0, [r4], #16 +; CHECK-NEXT: letp lr, .LBB4_4 +; CHECK-NEXT: b .LBB4_2 +; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup +; CHECK-NEXT: pop {r4, r5, r7, pc} +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %X.bits = bitcast i32* %X to i8* + %Y.bits = bitcast i32* %Y to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %X.bits, i8* align 4 %Y.bits, i32 %n, i1 false) + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body.preheader, %entry + ret void +} + diff --git a/llvm/test/CodeGen/Thumb2/mve_tp_loop.mir b/llvm/test/CodeGen/Thumb2/mve_tp_loop.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve_tp_loop.mir @@ -0,0 +1,219 @@ +# NOTE: Assertions have been autogenerated by 
utils/update_mir_test_checks.py +# RUN: llc -O1 -mtriple=arm-arm-none-eabi -mcpu=cortex-m55 -simplify-mir -run-pass=finalize-isel %s -o - | FileCheck %s +--- | + ; ModuleID = 'llvm/test/CodeGen/Thumb2/mve_tp_loop.ll' + source_filename = "llvm/test/CodeGen/Thumb2/mve_tp_loop.ll" + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "arm-arm-none-eabi" + + ; Function Attrs: argmemonly nofree nosync nounwind willreturn + declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) #0 + + define void @test1(i32* noalias %X, i32* noalias readonly %Y, i32 %n) #1 { + entry: + %0 = bitcast i32* %X to i8* + %1 = bitcast i32* %Y to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false) + ret void + } + + define void @test2(i32* noalias %X, i32* noalias readonly %Y, i32 %n) #1 { + entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + %X.bits = bitcast i32* %X to i8* + %Y.bits = bitcast i32* %Y to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %X.bits, i8* align 4 %Y.bits, i32 %n, i1 false) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %for.body.preheader, %entry + ret void + } + + attributes #0 = { argmemonly nofree nosync nounwind willreturn "target-cpu"="cortex-m55" } + attributes #1 = { "target-cpu"="cortex-m55" } + +... 
+--- +name: test1 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: + - { id: 0, class: rgpr, preferred-register: '' } + - { id: 1, class: rgpr, preferred-register: '' } + - { id: 2, class: rgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$r1', virtual-reg: '%1' } + - { reg: '$r2', virtual-reg: '%2' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $r0, $r1, $r2 + + ; CHECK-LABEL: name: test1 + ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2 + ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r1 + ; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2LSRri:%[0-9]+]]:rgpr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]] + ; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.3, implicit-def $cpsr + ; CHECK: t2B %bb.1, 14 /* CC::al */, $noreg + ; CHECK: .1: + ; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg + ; CHECK: .2: + ; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY1]], %bb.1, %8, %bb.2 + ; CHECK: [[PHI1:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.1, %10, %bb.2 + ; 
CHECK: [[PHI2:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.1, %12, %bb.2 + ; CHECK: [[PHI3:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.1, %14, %bb.2 + ; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg + ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]] + ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]] + ; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1 + ; CHECK: t2LoopEnd [[t2LoopDec]], %bb.2, implicit-def $cpsr + ; CHECK: t2B %bb.3, 14 /* CC::al */, $noreg + ; CHECK: .3.entry: + ; CHECK: tBX_RET 14 /* CC::al */, $noreg + %2:rgpr = COPY $r2 + %1:rgpr = COPY $r1 + %0:rgpr = COPY $r0 + MVE_MEMCPYLOOPINST %0, %1, %2 + tBX_RET 14 /* CC::al */, $noreg + +... +--- +name: test2 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: + - { id: 0, class: rgpr, preferred-register: '' } + - { id: 1, class: rgpr, preferred-register: '' } + - { id: 2, class: rgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$r1', virtual-reg: '%1' } + - { reg: '$r2', virtual-reg: '%2' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: 
name: test2 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x50000000), %bb.2(0x30000000) + ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2 + ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r1 + ; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2Bcc %bb.2, 11 /* CC::lt */, $cpsr + ; CHECK: t2B %bb.1, 14 /* CC::al */, $noreg + ; CHECK: bb.3: + ; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg + ; CHECK: bb.4: + ; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY1]], %bb.3, %8, %bb.4 + ; CHECK: [[PHI1:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.3, %10, %bb.4 + ; CHECK: [[PHI2:%[0-9]+]]:gprlr = PHI %6, %bb.3, %12, %bb.4 + ; CHECK: [[PHI3:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.3, %14, %bb.4 + ; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg + ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]] + ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]] + ; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1 + ; CHECK: t2LoopEnd [[t2LoopDec]], %bb.4, implicit-def $cpsr + ; CHECK: t2B %bb.1, 14 /* CC::al */, $noreg + ; CHECK: bb.1.for.body.preheader: + ; CHECK: successors: %bb.2(0x80000000), %bb.3(0x00000000) + ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2LSRri:%[0-9]+]]:rgpr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]] + ; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.1, implicit-def $cpsr + ; CHECK: t2B %bb.3, 14 /* CC::al */, $noreg + ; CHECK: bb.2.for.cond.cleanup: + 
; CHECK: tBX_RET 14 /* CC::al */, $noreg + bb.0.entry: + successors: %bb.1(0x50000000), %bb.2(0x30000000) + liveins: $r0, $r1, $r2 + + %2:rgpr = COPY $r2 + %1:rgpr = COPY $r1 + %0:rgpr = COPY $r0 + t2CMPri %2, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.2, 11 /* CC::lt */, $cpsr + t2B %bb.1, 14 /* CC::al */, $noreg + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + + MVE_MEMCPYLOOPINST %0, %1, %2 + + bb.2.for.cond.cleanup: + tBX_RET 14 /* CC::al */, $noreg + +...