Index: lib/Target/ARM/ARMBaseInstrInfo.h
===================================================================
--- lib/Target/ARM/ARMBaseInstrInfo.h
+++ lib/Target/ARM/ARMBaseInstrInfo.h
@@ -352,6 +352,8 @@
   virtual void expandLoadStackGuard(MachineBasicBlock::iterator MI,
                                     Reloc::Model RM) const = 0;
 
+  void expandMEMCPY(MachineBasicBlock::iterator) const;
+
 private:
   /// Modeling special VFP / NEON fp MLA / MLS hazards.
Index: lib/Target/ARM/ARMBaseInstrInfo.cpp
===================================================================
--- lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1219,6 +1219,64 @@
   return MI->mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex);
 }
 
+/// \brief Expands MEMCPY to either LDMIA/STMIA or LDMIA_UPD/STMIA_UPD
+/// depending on whether the result is used.
+void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MBBI) const {
+  bool isThumb1 = Subtarget.isThumb1Only();
+  bool isThumb2 = Subtarget.isThumb2();
+  const ARMBaseInstrInfo *TII = Subtarget.getInstrInfo();
+
+  MachineInstr *MI = MBBI;
+  DebugLoc dl = MI->getDebugLoc();
+  MachineBasicBlock *BB = MI->getParent();
+
+  MachineInstrBuilder LDM, STM;
+  if (isThumb1 || !MI->getOperand(1).isDead()) {
+    LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA_UPD
+                                      : isThumb1 ? ARM::tLDMIA_UPD
+                                                 : ARM::LDMIA_UPD))
+              .addOperand(MI->getOperand(1));
+  } else {
+    LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA : ARM::LDMIA));
+  }
+
+  if (isThumb1 || !MI->getOperand(0).isDead()) {
+    STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA_UPD
+                                      : isThumb1 ? ARM::tSTMIA_UPD
+                                                 : ARM::STMIA_UPD))
+              .addOperand(MI->getOperand(0));
+  } else {
+    STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA : ARM::STMIA));
+  }
+
+  LDM.addOperand(MI->getOperand(3)).addImm(ARMCC::AL).addReg(0);
+  STM.addOperand(MI->getOperand(2)).addImm(ARMCC::AL).addReg(0);
+
+  // Sort the scratch registers into ascending order.
+  const TargetRegisterInfo &TRI = getRegisterInfo();
+  unsigned NumScratch = MI->getOperand(4).getImm();
+  llvm::SmallVector<unsigned, 6> ScratchRegs(NumScratch);
+  std::transform(MI->operands_begin() + 5, MI->operands_end(),
+                 ScratchRegs.begin(),
+                 [](const MachineOperand &Op) { return Op.getReg(); });
+  std::sort(ScratchRegs.begin(), ScratchRegs.end(),
+            [&TRI](const unsigned &Reg1, const unsigned &Reg2) -> bool {
+              return TRI.getEncodingValue(Reg1) <
+                     TRI.getEncodingValue(Reg2);
+            });
+
+  for (const auto &Reg : ScratchRegs) {
+    LDM.addReg(Reg, RegState::Define);
+    STM.addReg(Reg, RegState::Kill);
+  }
+
+  BB->erase(MBBI);
+}
+
 bool
 ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   MachineFunction &MF = *MI->getParent()->getParent();
@@ -1232,6 +1290,11 @@
     return true;
   }
 
+  if (MI->getOpcode() == ARM::MEMCPY) {
+    expandMEMCPY(MI);
+    return true;
+  }
+
   // This hook gets to expand COPY instructions before they become
   // copyPhysReg() calls. Look for VMOVS instructions that can legally be
   // widened to VMOVD. We prefer the VMOVD when possible because it may be
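The sort above matters because LDM/STM encode their register list as a bitmask and always pair the lowest-numbered register with the lowest address, so the scratch operands are kept in ascending encoding order. A standalone sketch of the same sort, not part of the patch, with a toy encoding table standing in for TargetRegisterInfo::getEncodingValue (register names and values are illustrative):

    #include <algorithm>
    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
      // Suppose the register allocator handed us scratch regs in this order.
      std::vector<std::string> Scratch = {"r12", "r4", "r5"};
      // Toy encoding table; the real values come from TargetRegisterInfo.
      std::map<std::string, unsigned> Enc = {{"r4", 4}, {"r5", 5}, {"r12", 12}};
      std::sort(Scratch.begin(), Scratch.end(),
                [&](const std::string &A, const std::string &B) {
                  return Enc[A] < Enc[B];
                });
      // Prints: ldm r0!, {r4, r5, r12}
      std::printf("ldm r0!, {");
      for (unsigned I = 0; I != Scratch.size(); ++I)
        std::printf("%s%s", I ? ", " : "", Scratch[I].c_str());
      std::printf("}\n");
    }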
Index: lib/Target/ARM/ARMISelLowering.h
===================================================================
--- lib/Target/ARM/ARMISelLowering.h
+++ lib/Target/ARM/ARMISelLowering.h
@@ -185,6 +185,10 @@
       // Vector bitwise select
       VBSL,
 
+      // Pseudo-instruction representing a memory copy using ldm/stm
+      // instructions.
+      MEMCPY,
+
       // Vector load N-element structure to all lanes:
       VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
       VLD3DUP,
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -1178,6 +1178,7 @@
   case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
   case ARMISD::VBICIMM: return "ARMISD::VBICIMM";
   case ARMISD::VBSL: return "ARMISD::VBSL";
+  case ARMISD::MEMCPY: return "ARMISD::MEMCPY";
   case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP";
   case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP";
   case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP";
@@ -8072,8 +8073,42 @@
   }
 }
 
+/// \brief Attaches vregs to MEMCPY that it will use as scratch registers
+/// when it is expanded into LDM/STM. This is done as a post-isel lowering
+/// instead of as a custom inserter because we need the use list from the
+/// SDNode.
+static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
+                                    MachineInstr *MI, const SDNode *Node) {
+  bool isThumb1 = Subtarget->isThumb1Only();
+
+  DebugLoc DL = MI->getDebugLoc();
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineInstrBuilder MIB(*MF, MI);
+
+  // If the new dst/src is unused, mark it as dead.
+  if (!Node->hasAnyUseOfValue(0)) {
+    MI->getOperand(0).setIsDead(true);
+  }
+  if (!Node->hasAnyUseOfValue(1)) {
+    MI->getOperand(1).setIsDead(true);
+  }
+
+  // The MEMCPY both defines and kills the scratch registers.
+  const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
+  for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) {
+    unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
+                                                         : &ARM::GPRRegClass);
+    MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
+  }
+}
+
 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                       SDNode *Node) const {
+  if (MI->getOpcode() == ARM::MEMCPY) {
+    attachMEMCPYScratchRegs(Subtarget, MI, Node);
+    return;
+  }
+
   const MCInstrDesc *MCID = &MI->getDesc();
   // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
   // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
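For what the dead-result marking buys us: expandMEMCPY (above) keys the choice between the plain and base-updating forms off exactly these flags. A minimal standalone model of that decision, assuming nothing beyond what the two functions show (the struct and names are invented for illustration):

    #include <cstdio>

    // Toy stand-in for the MEMCPY pseudo's two result operands.
    struct MemcpyPseudo {
      bool SrcDead; // attachMEMCPYScratchRegs marked %newsrc dead
      bool DstDead; // attachMEMCPYScratchRegs marked %newdst dead
    };

    const char *loadForm(const MemcpyPseudo &MI, bool IsThumb1) {
      // Thumb1 LDM always writes the base register back, so the updating
      // form is used there regardless of liveness.
      if (IsThumb1 || !MI.SrcDead)
        return "LDMIA_UPD"; // base register updated past the copied block
      return "LDMIA";       // incremented base unused, keep the plain form
    }

    int main() {
      MemcpyPseudo Live{false, false}, Dead{true, true};
      std::printf("%s\n", loadForm(Live, false)); // LDMIA_UPD
      std::printf("%s\n", loadForm(Dead, false)); // LDMIA
    }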
Index: lib/Target/ARM/ARMInstrInfo.td
===================================================================
--- lib/Target/ARM/ARMInstrInfo.td
+++ lib/Target/ARM/ARMInstrInfo.td
@@ -73,6 +73,10 @@
 
 def SDT_WIN__DBZCHK : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
 
+def SDT_ARMMEMCPY : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                         SDTCisVT<2, i32>, SDTCisVT<3, i32>,
+                                         SDTCisVT<4, i32>]>;
+
 def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
                                             [SDTCisSameAs<0, 2>,
                                              SDTCisSameAs<0, 3>,
@@ -179,6 +183,10 @@
 
 def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>;
 
+def ARMmemcopy : SDNode<"ARMISD::MEMCPY", SDT_ARMMEMCPY,
+                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+                         SDNPMayStore, SDNPMayLoad]>;
+
 //===----------------------------------------------------------------------===//
 // ARM Instruction Predicate Definitions.
 //
@@ -4577,6 +4585,19 @@
     [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>;
 }
 
+let hasPostISelHook = 1, Constraints = "$newdst = $dst, $newsrc = $src" in {
+  // %newsrc, %newdst = MEMCPY %dst, %src, N, ...N scratch regs...
+  // Copies N registers worth of memory from address %src to address %dst
+  // and returns the incremented addresses. N scratch registers will
+  // be attached for the copy to use.
+  def MEMCPY : PseudoInst<
+    (outs GPR:$newdst, GPR:$newsrc),
+    (ins GPR:$dst, GPR:$src, i32imm:$nreg, variable_ops),
+    NoItinerary,
+    [(set GPR:$newdst, GPR:$newsrc,
+          (ARMmemcopy GPR:$dst, GPR:$src, imm:$nreg))]>;
+}
+
 def ldrex_1 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
   return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
 }]>;
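To make the operand layout of the MEMCPY pseudo above concrete: for a 16-byte copy, isel plus the post-isel hook produce something of the shape (virtual register names here are hypothetical)

    %newdst, %newsrc = MEMCPY %dst, %src, 4, %t0, %t1, %t2, %t3

and expandPostRAPseudo later rewrites it, after register allocation, into an ldm/stm pair over the four scratch registers, with or without base writeback as described earlier.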
Index: lib/Target/ARM/ARMInstrThumb.td
===================================================================
--- lib/Target/ARM/ARMInstrThumb.td
+++ lib/Target/ARM/ARMInstrThumb.td
@@ -740,6 +740,7 @@
 // Writeback version is just a pseudo, as there's no encoding difference.
 // Writeback happens iff the base register is not in the destination register
 // list.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
 def tLDMIA_UPD :
     InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain,
                  "$Rn = $wb", IIC_iLoad_mu>,
Index: lib/Target/ARM/ARMSelectionDAGInfo.cpp
===================================================================
--- lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -160,41 +160,39 @@
   unsigned VTSize = 4;
   unsigned i = 0;
   // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
-  const unsigned MAX_LOADS_IN_LDM = Subtarget.isThumb1Only() ? 4 : 6;
+  const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
   SDValue TFOps[6];
   SDValue Loads[6];
   uint64_t SrcOff = 0, DstOff = 0;
 
-  // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
-  // same number of stores. The loads and stores will get combined into
-  // ldm/stm later on.
-  while (EmittedNumMemOps < NumMemOps) {
-    for (i = 0;
-         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
-      Loads[i] = DAG.getLoad(VT, dl, Chain,
-                             DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
-                                         DAG.getConstant(SrcOff, dl, MVT::i32)),
-                             SrcPtrInfo.getWithOffset(SrcOff), isVolatile,
-                             false, false, 0);
-      TFOps[i] = Loads[i].getValue(1);
-      SrcOff += VTSize;
-    }
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                        makeArrayRef(TFOps, i));
+  // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
+  // VLDM/VSTM and make this code emit it when appropriate. This would reduce
+  // pressure on the general purpose registers. However this seems harder to
+  // map onto the register allocator's view of the world.
 
-    for (i = 0;
-         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
-      TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
-                              DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
-                                          DAG.getConstant(DstOff, dl, MVT::i32)),
-                              DstPtrInfo.getWithOffset(DstOff),
-                              isVolatile, false, 0);
-      DstOff += VTSize;
-    }
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                        makeArrayRef(TFOps, i));
+  // The number of MEMCPY pseudo-instructions to emit. We use up to
+  // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
+  // later on. This is a lower bound on the number of MEMCPY operations we must
+  // emit.
+  unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
+
+  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
+
+  for (unsigned I = 0; I != NumMEMCPYs; ++I) {
+    // Evenly distribute registers among MEMCPY operations to reduce register
+    // pressure.
+    unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
+    unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
+
+    Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
+                      DAG.getConstant(NumRegs, dl, MVT::i32));
+    Src = Dst.getValue(1);
+    Chain = Dst.getValue(2);
+
+    DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
+    SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);
 
-    EmittedNumMemOps += i;
+    EmittedNumMemOps = NextEmittedNumMemOps;
   }
 
   if (BytesLeft == 0)
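To see the even split concretely, here is the loop's arithmetic extracted into a standalone program (values chosen to match the foo36 tests below: a 36-byte copy on Thumb1, so NumMemOps = 9 and MaxLoadsInLDM = 4). A greedy split would produce 4+4+1 registers; the formula yields 3+3+3, which keeps fewer scratch registers live at any one point:

    #include <cstdio>

    int main() {
      unsigned NumMemOps = 9, MaxLoadsInLDM = 4;
      unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM; // 3
      unsigned EmittedNumMemOps = 0;
      for (unsigned I = 0; I != NumMEMCPYs; ++I) {
        unsigned Next = NumMemOps * (I + 1) / NumMEMCPYs;
        std::printf("MEMCPY %u copies %u registers\n", I, Next - EmittedNumMemOps);
        EmittedNumMemOps = Next; // prints 3, 3, 3
      }
    }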
Index: lib/Target/ARM/Thumb2SizeReduction.cpp
===================================================================
--- lib/Target/ARM/Thumb2SizeReduction.cpp
+++ lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -125,7 +125,10 @@
   { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
   { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1,0 },
   { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1,0 },
-  // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent
+  // ARM::t2STMIA (with no basereg writeback) has no Thumb1 equivalent.
+  // tSTMIA_UPD is a change in semantics which can only be used if the base
+  // register is killed. This difference is correctly handled elsewhere.
+  { ARM::t2STMIA, ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
   { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
   { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1,0 }
 };
@@ -435,6 +438,14 @@
     isLdStMul = true;
     break;
   }
+  case ARM::t2STMIA: {
+    // If the base register is killed, we don't care what its value is after
+    // the instruction, so we can use an updating STMIA.
+    if (!MI->getOperand(0).isKill())
+      return false;
+
+    break;
+  }
   case ARM::t2LDMIA_RET: {
     unsigned BaseReg = MI->getOperand(1).getReg();
     if (BaseReg != ARM::SP)
@@ -492,6 +503,12 @@
   // Add the 16-bit load / store instruction.
   DebugLoc dl = MI->getDebugLoc();
   MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc));
+
+  // tSTMIA_UPD takes a defining register operand. We've already checked that
+  // the register is killed, so mark it as dead here.
+  if (Entry.WideOpc == ARM::t2STMIA)
+    MIB.addReg(MI->getOperand(0).getReg(), RegState::Define | RegState::Dead);
+
   if (!isLdStMul) {
     MIB.addOperand(MI->getOperand(0));
     MIB.addOperand(MI->getOperand(1));
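A standalone restatement of the legality rule the two hunks above implement (the type and names are invented for illustration): tSTMIA_UPD always writes the incremented base back, so it can only replace t2STMIA when nothing reads the base afterwards.

    #include <cstdio>

    struct StoreMultiple {
      bool BaseKilled; // kill flag on the t2STMIA base-register operand
    };

    bool canNarrowToThumb1(const StoreMultiple &MI) {
      // tSTMIA_UPD clobbers the base register; the shrink is only sound if
      // the base's old value is dead after this instruction.
      return MI.BaseKilled;
    }

    int main() {
      std::printf("%d\n", canNarrowToThumb1({true}));  // 1: use tSTMIA_UPD
      std::printf("%d\n", canNarrowToThumb1({false})); // 0: keep t2STMIA
    }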
Index: test/CodeGen/ARM/ldm-stm-base-materialization.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/ldm-stm-base-materialization.ll
@@ -0,0 +1,93 @@
+; RUN: llc -mtriple armv7a-none-eabi -mattr=-neon < %s -verify-machineinstrs -o - | FileCheck %s
+
+; Thumb1 (thumbv6m) is tested in test/CodeGen/Thumb
+
+@a = external global i32*
+@b = external global i32*
+
+; Function Attrs: nounwind
+define void @foo24() #0 {
+entry:
+; CHECK-LABEL: foo24:
+; We use '[rl0-9]+' to allow 'r0'..'r12', 'lr'
+; CHECK: movt [[LB:[rl0-9]+]], :upper16:b
+; CHECK: movt [[SB:[rl0-9]+]], :upper16:a
+; CHECK: add [[NLB:[rl0-9]+]], [[LB]], #4
+; CHECK: add [[NSB:[rl0-9]+]], [[SB]], #4
+; CHECK-NEXT: ldm [[NLB]], {[[R1:[rl0-9]+]], [[R2:[rl0-9]+]], [[R3:[rl0-9]+]], [[R4:[rl0-9]+]], [[R5:[rl0-9]+]], [[R6:[rl0-9]+]]}
+; CHECK-NEXT: stm [[NSB]], {[[R1]], [[R2]], [[R3]], [[R4]], [[R5]], [[R6]]}
+  %0 = load i32*, i32** @a, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
+  %1 = bitcast i32* %arrayidx to i8*
+  %2 = load i32*, i32** @b, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
+  %3 = bitcast i32* %arrayidx1 to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 24, i32 4, i1 false)
+  ret void
+}
+
+define void @foo28() #0 {
+entry:
+; CHECK-LABEL: foo28:
+; CHECK: movt [[LB:[rl0-9]+]], :upper16:b
+; CHECK: movt [[SB:[rl0-9]+]], :upper16:a
+; CHECK: add [[NLB:[rl0-9]+]], [[LB]], #4
+; CHECK: add [[NSB:[rl0-9]+]], [[SB]], #4
+; CHECK-NEXT: ldm [[NLB]]!, {[[R1:[rl0-9]+]], [[R2:[rl0-9]+]], [[R3:[rl0-9]+]]}
+; CHECK-NEXT: stm [[NSB]]!, {[[R1]], [[R2]], [[R3]]}
+; CHECK-NEXT: ldm [[NLB]], {[[R1:[rl0-9]+]], [[R2:[rl0-9]+]], [[R3:[rl0-9]+]], [[R4:[rl0-9]+]]}
+; CHECK-NEXT: stm [[NSB]], {[[R1]], [[R2]], [[R3]], [[R4]]}
+  %0 = load i32*, i32** @a, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
+  %1 = bitcast i32* %arrayidx to i8*
+  %2 = load i32*, i32** @b, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
+  %3 = bitcast i32* %arrayidx1 to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 28, i32 4, i1 false)
+  ret void
+}
+
+define void @foo32() #0 {
+entry:
+; CHECK-LABEL: foo32:
+; CHECK: movt [[LB:[rl0-9]+]], :upper16:b
+; CHECK: movt [[SB:[rl0-9]+]], :upper16:a
+; CHECK: add [[NLB:[rl0-9]+]], [[LB]], #4
+; CHECK: add [[NSB:[rl0-9]+]], [[SB]], #4
+; CHECK-NEXT: ldm [[NLB]]!, {[[R1:[rl0-9]+]], [[R2:[rl0-9]+]], [[R3:[rl0-9]+]], [[R4:[rl0-9]+]]}
+; CHECK-NEXT: stm [[NSB]]!, {[[R1]], [[R2]], [[R3]], [[R4]]}
+; CHECK-NEXT: ldm [[NLB]], {[[R1:[rl0-9]+]], [[R2:[rl0-9]+]], [[R3:[rl0-9]+]], [[R4:[rl0-9]+]]}
+; CHECK-NEXT: stm [[NSB]], {[[R1]], [[R2]], [[R3]], [[R4]]}
+  %0 = load i32*, i32** @a, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
+  %1 = bitcast i32* %arrayidx to i8*
+  %2 = load i32*, i32** @b, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
+  %3 = bitcast i32* %arrayidx1 to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 32, i32 4, i1 false)
+  ret void
+}
+
+define void @foo36() #0 {
+entry:
+; CHECK-LABEL: foo36:
+; CHECK: movt [[LB:[rl0-9]+]], :upper16:b
+; CHECK: movt [[SB:[rl0-9]+]], :upper16:a
+; CHECK: add [[NLB:[rl0-9]+]], [[LB]], #4
+; CHECK: add [[NSB:[rl0-9]+]], [[SB]], #4
+; CHECK-NEXT: ldm [[NLB]]!, {[[R1:[rl0-9]+]], [[R2:[rl0-9]+]], [[R3:[rl0-9]+]], [[R4:[rl0-9]+]]}
+; CHECK-NEXT: stm [[NSB]]!, {[[R1]], [[R2]], [[R3]], [[R4]]}
+; CHECK-NEXT: ldm [[NLB]], {[[R1:[rl0-9]+]], [[R2:[rl0-9]+]], [[R3:[rl0-9]+]], [[R4:[rl0-9]+]], [[R5:[rl0-9]+]]}
+; CHECK-NEXT: stm [[NSB]], {[[R1]], [[R2]], [[R3]], [[R4]], [[R5]]}
+  %0 = load i32*, i32** @a, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
+  %1 = bitcast i32* %arrayidx to i8*
+  %2 = load i32*, i32** @b, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
+  %3 = bitcast i32* %arrayidx1 to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 36, i32 4, i1 false)
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1
Index: test/CodeGen/ARM/load-store-flags.ll
===================================================================
--- test/CodeGen/ARM/load-store-flags.ll
+++ test/CodeGen/ARM/load-store-flags.ll
@@ -6,7 +6,7 @@
 define void @test_base_kill(i32 %v0, i32 %v1, i32* %addr) {
 ; CHECK-LABEL: test_base_kill:
 ; CHECK: adds [[NEWBASE:r[0-9]+]], r2, #4
-; CHECK: stm.w [[NEWBASE]], {r0, r1, r2}
+; CHECK: stm [[NEWBASE]]!, {r0, r1, r2}
 
   %addr.1 = getelementptr i32, i32* %addr, i32 1
   store i32 %v0, i32* %addr.1
@@ -27,7 +27,7 @@
 define void @test_base_kill_mid(i32 %v0, i32* %addr, i32 %v1) {
 ; CHECK-LABEL: test_base_kill_mid:
 ; CHECK: adds [[NEWBASE:r[0-9]+]], r1, #4
-; CHECK: stm.w [[NEWBASE]], {r0, r1, r2}
+; CHECK: stm [[NEWBASE]]!, {r0, r1, r2}
 
   %addr.1 = getelementptr i32, i32* %addr, i32 1
   store i32 %v0, i32* %addr.1
Index: test/CodeGen/ARM/memcpy-ldm-stm.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/memcpy-ldm-stm.ll
@@ -0,0 +1,94 @@
+; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - | \
+; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=CHECKV6
+; RUN: llc -mtriple=thumbv6m-eabi -O=0 -verify-machineinstrs %s -o - | \
+; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=CHECKV6
+; RUN: llc -mtriple=thumbv7a-eabi -mattr=-neon -verify-machineinstrs %s -o - | \
+; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=CHECKV7
+; RUN: llc -mtriple=armv7a-eabi -mattr=-neon -verify-machineinstrs %s -o - | \
+; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=CHECKV7
+
+@d = external global [64 x i32]
+@s = external global [64 x i32]
+
+; Function Attrs: nounwind
+define void @t1() #0 {
+entry:
+; CHECK-LABEL: t1:
+; CHECKV6: ldr [[LB:r[0-7]]],
+; CHECKV6-NEXT: ldr [[SB:r[0-7]]],
+; We use '[rl0-9]+' to allow 'r0'..'r12', 'lr'
+; CHECKV7: movt [[LB:[rl0-9]+]], :upper16:d
+; CHECKV7-NEXT: movt [[SB:[rl0-9]+]], :upper16:s
+; CHECK-NEXT: ldm{{(\.w)?}} [[LB]]!,
+; CHECK-NEXT: stm{{(\.w)?}} [[SB]]!,
+; Think of the monstrosity '{{\[}}[[LB]]]' as '[ [[LB]] ]' without the spaces.
+; CHECK-NEXT: ldrb{{(\.w)?}} {{.*}}, {{\[}}[[LB]]]
+; CHECK-NEXT: strb{{(\.w)?}} {{.*}}, {{\[}}[[SB]]]
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 17, i32 4, i1 false)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @t2() #0 {
+entry:
+; CHECK-LABEL: t2:
+; CHECKV6: ldr [[LB:r[0-7]]],
+; CHECKV6-NEXT: ldr [[SB:r[0-7]]],
+; CHECKV7: movt [[LB:[rl0-9]+]], :upper16:d
+; CHECKV7-NEXT: movt [[SB:[rl0-9]+]], :upper16:s
+; CHECK-NEXT: ldm{{(\.w)?}} [[LB]]!,
+; CHECK-NEXT: stm{{(\.w)?}} [[SB]]!,
+; CHECK-NEXT: ldrh{{(\.w)?}} {{.*}}, {{\[}}[[LB]]]
+; CHECK-NEXT: ldrb{{(\.w)?}} {{.*}}, {{\[}}[[LB]], #2]
+; CHECK-NEXT: strb{{(\.w)?}} {{.*}}, {{\[}}[[SB]], #2]
+; CHECK-NEXT: strh{{(\.w)?}} {{.*}}, {{\[}}[[SB]]]
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 15, i32 4, i1 false)
+  ret void
+}
+
+; PR23768
+%struct.T = type { i8, i64, i8 }
+
+@copy = external global %struct.T, align 8
+@etest = external global %struct.T, align 8
+
+define void @t3() {
+  call void @llvm.memcpy.p0i8.p0i8.i32(
+      i8* getelementptr inbounds (%struct.T, %struct.T* @copy, i32 0, i32 0),
+      i8* getelementptr inbounds (%struct.T, %struct.T* @etest, i32 0, i32 0),
+      i32 24, i32 8, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(
+      i8* getelementptr inbounds (%struct.T, %struct.T* @copy, i32 0, i32 0),
+      i8* getelementptr inbounds (%struct.T, %struct.T* @etest, i32 0, i32 0),
+      i32 24, i32 8, i1 false)
+  ret void
+}
+
+%struct.S = type { [12 x i32] }
+
+; CHECK-LABEL: test3
+define void @test3(%struct.S* %d, %struct.S* %s) #0 {
+  %1 = bitcast %struct.S* %d to i8*
+  %2 = bitcast %struct.S* %s to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %2, i32 48, i32 4, i1 false)
+; 48 bytes = 12 words: 3 ldm/stm pairs of 4 words in v6, 2 pairs of 6 in v7
+; CHECK: ldm{{(\.w)?}} {{[rl0-9]+!?}}, [[REGLIST1:{.*}]]
+; CHECK: stm{{(\.w)?}} {{[rl0-9]+!?}}, [[REGLIST1]]
+; CHECK: ldm{{(\.w)?}} {{[rl0-9]+!?}}, [[REGLIST2:{.*}]]
+; CHECK: stm{{(\.w)?}} {{[rl0-9]+!?}}, [[REGLIST2]]
+; CHECKV6: ldm {{r[0-7]!?}}, [[REGLIST3:{.*}]]
+; CHECKV6: stm {{r[0-7]!?}}, [[REGLIST3]]
+; CHECKV7-NOT: ldm
+; CHECKV7-NOT: stm
+  %arrayidx = getelementptr inbounds %struct.S, %struct.S* %s, i32 0, i32 0, i32 1
+  tail call void @g(i32* %arrayidx) #3
+  ret void
+}
+
+declare void @g(i32*)
+
+; Set "no-frame-pointer-elim" to increase register pressure
+attributes #0 = { "no-frame-pointer-elim"="true" }
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1
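The t1/t2 expectations above follow from simple size arithmetic: the MEMCPY covers whole words, and the remainder is copied with the widest scalar accesses that fit. A sketch of that arithmetic (mirroring the tests, not the actual SelectionDAG code):

    #include <cstdio>

    int main() {
      unsigned Size = 15, VTSize = 4;     // as in test t2
      unsigned NumMemOps = Size / VTSize; // 3 words -> one ldm/stm pair
      unsigned BytesLeft = Size % VTSize; // 3 trailing bytes
      std::printf("ldm/stm copies %u words\n", NumMemOps);
      while (BytesLeft) {
        unsigned Chunk = BytesLeft >= 2 ? 2 : 1;
        std::printf("%s copies %u byte(s)\n",
                    Chunk == 2 ? "ldrh/strh" : "ldrb/strb", Chunk); // 2, then 1
        BytesLeft -= Chunk;
      }
    }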
Index: test/CodeGen/Thumb/ldm-stm-base-materialization-thumb2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/Thumb/ldm-stm-base-materialization-thumb2.ll
@@ -0,0 +1,93 @@
+; RUN: llc -mattr=-neon < %s -verify-machineinstrs -o - | FileCheck %s
+
+target triple = "thumbv7a-none--eabi"
+
+@a = external global i32*
+@b = external global i32*
+
+; Function Attrs: nounwind
+define void @foo24() #0 {
+entry:
+; CHECK-LABEL: foo24:
+; We use '[rl0-9]+' to allow 'r0'..'r12', 'lr'
+; CHECK: movt [[LB:[rl0-9]+]], :upper16:b
+; CHECK: movt [[SB:[rl0-9]+]], :upper16:a
+; CHECK: add{{s?}}{{(\.w)?}} [[NLB:[rl0-9]+]], [[LB]], #4
+; CHECK: adds [[SB]], #4
+; CHECK-NEXT: ldm{{(\.w)?}} [[NLB]], {[[R1:[rl0-9]+]], [[R2:[rl0-9]+]], [[R3:[rl0-9]+]], [[R4:[rl0-9]+]], [[R5:[rl0-9]+]], [[R6:[rl0-9]+]]}
+; CHECK-NEXT: stm{{(\.w)?}} [[SB]], {[[R1]], [[R2]], [[R3]], [[R4]], [[R5]], [[R6]]}
+  %0 = load i32*, i32** @a, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
+  %1 = bitcast i32* %arrayidx to i8*
+  %2 = load i32*, i32** @b, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
+  %3 = bitcast i32* %arrayidx1 to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 24, i32 4, i1 false)
+  ret void
+}
+
+define void @foo28() #0 {
+entry:
+; CHECK-LABEL: foo28:
+; CHECK: movt [[LB:[rl0-9]+]], :upper16:b
+; CHECK: movt [[SB:[rl0-9]+]], :upper16:a
+; CHECK: add{{(\.w)?}} [[NLB:[rl0-9]+]], [[LB]], #4
+; CHECK: adds [[SB]], #4
+; CHECK-NEXT: ldm{{(\.w)?}} [[NLB]]!, {[[R1:[rl0-9]+]], [[R2:[rl0-9]+]], [[R3:[rl0-9]+]]}
+; CHECK-NEXT: stm{{(\.w)?}} [[SB]]!, {[[R1]], [[R2]], [[R3]]}
+; CHECK-NEXT: ldm{{(\.w)?}} [[NLB]], {[[R1:[rl0-9]+]], [[R2:[rl0-9]+]], [[R3:[rl0-9]+]], [[R4:[rl0-9]+]]}
+; CHECK-NEXT: stm{{(\.w)?}} [[SB]], {[[R1]], [[R2]], [[R3]], [[R4]]}
+  %0 = load i32*, i32** @a, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
+  %1 = bitcast i32* %arrayidx to i8*
+  %2 = load i32*, i32** @b, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
+  %3 = bitcast i32* %arrayidx1 to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 28, i32 4, i1 false)
+  ret void
+}
+
+define void @foo32() #0 {
+entry:
+; CHECK-LABEL: foo32:
+; CHECK: movt [[LB:[rl0-9]+]], :upper16:b
+; CHECK: movt [[SB:[rl0-9]+]], :upper16:a
+; CHECK: add{{(\.w)?}} [[NLB:[rl0-9]+]], [[LB]], #4
+; CHECK: adds [[SB]], #4
+; CHECK-NEXT: ldm{{(\.w)?}} [[NLB]]!, {[[R1:[rl0-9]+]], [[R2:[rl0-9]+]], [[R3:[rl0-9]+]], [[R4:[rl0-9]+]]}
+; CHECK-NEXT: stm{{(\.w)?}} [[SB]]!, {[[R1]], [[R2]], [[R3]], [[R4]]}
+; CHECK-NEXT: ldm{{(\.w)?}} [[NLB]], {[[R1:[rl0-9]+]], [[R2:[rl0-9]+]], [[R3:[rl0-9]+]], [[R4:[rl0-9]+]]}
+; CHECK-NEXT: stm{{(\.w)?}} [[SB]], {[[R1]], [[R2]], [[R3]], [[R4]]}
+  %0 = load i32*, i32** @a, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
+  %1 = bitcast i32* %arrayidx to i8*
+  %2 = load i32*, i32** @b, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
+  %3 = bitcast i32* %arrayidx1 to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 32, i32 4, i1 false)
+  ret void
+}
+
+define void @foo36() #0 {
+entry:
+; CHECK-LABEL: foo36:
+; CHECK: movt [[LB:[rl0-9]+]], :upper16:b
+; CHECK: movt [[SB:[rl0-9]+]], :upper16:a
+; CHECK: add{{(\.w)?}} [[NLB:[rl0-9]+]], [[LB]], #4
+; CHECK: adds [[SB]], #4
+; CHECK-NEXT: ldm{{(\.w)?}} [[NLB]]!, {[[R1:[rl0-9]+]], [[R2:[rl0-9]+]], [[R3:[rl0-9]+]], [[R4:[rl0-9]+]]}
+; CHECK-NEXT: stm{{(\.w)?}} [[SB]]!, {[[R1]], [[R2]], [[R3]], [[R4]]}
+; CHECK-NEXT: ldm{{(\.w)?}} [[NLB]], {[[R1:[rl0-9]+]], [[R2:[rl0-9]+]], [[R3:[rl0-9]+]], [[R4:[rl0-9]+]], [[R5:[rl0-9]+]]}
+; CHECK-NEXT: stm{{(\.w)?}} [[SB]], {[[R1]], [[R2]], [[R3]], [[R4]], [[R5]]}
+  %0 = load i32*, i32** @a, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
+  %1 = bitcast i32* %arrayidx to i8*
+  %2 = load i32*, i32** @b, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
+  %3 = bitcast i32* %arrayidx1 to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 36, i32 4, i1 false)
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1
Index: test/CodeGen/Thumb/ldm-stm-base-materialization.ll
===================================================================
--- test/CodeGen/Thumb/ldm-stm-base-materialization.ll
+++ test/CodeGen/Thumb/ldm-stm-base-materialization.ll
@@ -6,15 +6,17 @@
 @b = external global i32*
 
 ; Function Attrs: nounwind
-define void @foo() #0 {
+define void @foo24() #0 {
 entry:
-; CHECK-LABEL: foo:
-; CHECK: ldr r[[SB:[0-9]]], .LCPI
+; CHECK-LABEL: foo24:
 ; CHECK: ldr r[[LB:[0-9]]], .LCPI
 ; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4
-; CHECK-NEXT: ldm r[[NLB]],
+; CHECK: ldr r[[SB:[0-9]]], .LCPI
 ; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4
-; CHECK-NEXT: stm r[[NSB]]
+; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]}
+; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]}
+; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]}
+; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]}
   %0 = load i32*, i32** @a, align 4
   %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
   %1 = bitcast i32* %arrayidx to i8*
@@ -25,5 +27,70 @@
   ret void
 }
 
+define void @foo28() #0 {
+entry:
+; CHECK-LABEL: foo28:
+; CHECK: ldr r[[LB:[0-9]]], .LCPI
+; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4
+; CHECK: ldr r[[SB:[0-9]]], .LCPI
+; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4
+; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]}
+; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]}
+; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]], r[[R4:[0-9]]]}
+; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]], r[[R4]]}
+  %0 = load i32*, i32** @a, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
+  %1 = bitcast i32* %arrayidx to i8*
+  %2 = load i32*, i32** @b, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
+  %3 = bitcast i32* %arrayidx1 to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 28, i32 4, i1 false)
+  ret void
+}
+
+define void @foo32() #0 {
+entry:
+; CHECK-LABEL: foo32:
+; CHECK: ldr r[[LB:[0-9]]], .LCPI
+; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4
+; CHECK: ldr r[[SB:[0-9]]], .LCPI
+; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4
+; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]], r[[R4:[0-9]]]}
+; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]], r[[R4]]}
+; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]], r[[R4:[0-9]]]}
+; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]], r[[R4]]}
+  %0 = load i32*, i32** @a, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
+  %1 = bitcast i32* %arrayidx to i8*
+  %2 = load i32*, i32** @b, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
+  %3 = bitcast i32* %arrayidx1 to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 32, i32 4, i1 false)
+  ret void
+}
+
+define void @foo36() #0 {
+entry:
+; CHECK-LABEL: foo36:
+; CHECK: ldr r[[LB:[0-9]]], .LCPI
+; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4
+; CHECK: ldr r[[SB:[0-9]]], .LCPI
+; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4
+; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]}
+; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]}
+; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]}
+; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]}
+; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]}
+; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]}
+  %0 = load i32*, i32** @a, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 1
+  %1 = bitcast i32* %arrayidx to i8*
+  %2 = load i32*, i32** @b, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1
+  %3 = bitcast i32* %arrayidx1 to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 36, i32 4, i1 false)
+  ret void
+}
+
 ; Function Attrs: nounwind
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1
Index: test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll
===================================================================
--- test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll
+++ /dev/null
@@ -1,36 +0,0 @@
-; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - | FileCheck %s
-@d = external global [64 x i32]
-@s = external global [64 x i32]
-
-; Function Attrs: nounwind
-define void @t1() #0 {
-entry:
-; CHECK-LABEL: t1:
-; CHECK: ldr r[[LB:[0-9]]],
-; CHECK-NEXT: ldm r[[LB]]!,
-; CHECK-NEXT: ldr r[[SB:[0-9]]],
-; CHECK-NEXT: stm r[[SB]]!,
-; CHECK-NEXT: ldrb {{.*}}, [r[[LB]]]
-; CHECK-NEXT: strb {{.*}}, [r[[SB]]]
-  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 17, i32 4, i1 false)
-  ret void
-}
-
-; Function Attrs: nounwind
-define void @t2() #0 {
-entry:
-; CHECK-LABEL: t2:
-; CHECK: ldr r[[LB:[0-9]]],
-; CHECK-NEXT: ldm r[[LB]]!,
-; CHECK-NEXT: ldr r[[SB:[0-9]]],
-; CHECK-NEXT: stm r[[SB]]!,
-; CHECK-NEXT: ldrh {{.*}}, [r[[LB]]]
-; CHECK-NEXT: ldrb {{.*}}, [r[[LB]], #2]
-; CHECK-NEXT: strb {{.*}}, [r[[SB]], #2]
-; CHECK-NEXT: strh {{.*}}, [r[[SB]]]
-  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 15, i32 4, i1 false)
-  ret void
-}
-
-; Function Attrs: nounwind
-declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1