Index: lib/Target/ARM/ARMISelLowering.h =================================================================== --- lib/Target/ARM/ARMISelLowering.h +++ lib/Target/ARM/ARMISelLowering.h @@ -189,6 +189,10 @@ // Vector bitwise select VBSL, + // Pseudo-instruction representing a memory copy using ldm/stm + // instructions. + MCOPY, + // Vector load N-element structure to all lanes: VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, VLD3DUP, Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -7559,8 +7559,55 @@ } } +/// \brief Lowers MCOPY to either LDMIA/STMIA or LDMIA_UPD/STMIA_UPD depending +/// on whether the result is used. This is done as a post-isel lowering instead +/// of as a custom inserter because we need the use list from the SDNode. +static void LowerMCOPY(const ARMSubtarget *Subtarget, MachineInstr *MI, + SDNode *Node) { + bool isThumb2 = Subtarget->isThumb2(); + const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); + + DebugLoc dl = MI->getDebugLoc(); + MachineBasicBlock *BB = MI->getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + MachineInstrBuilder LD, ST; + if (Node->hasAnyUseOfValue(1)) { + LD = BuildMI(*BB, MI, dl, + TII->get(isThumb2 ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD)) + .addOperand(MI->getOperand(1)); + } else { + LD = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA : ARM::LDMIA)); + } + + if (Node->hasAnyUseOfValue(0)) { + ST = BuildMI(*BB, MI, dl, + TII->get(isThumb2 ? ARM::t2STMIA_UPD : ARM::STMIA_UPD)) + .addOperand(MI->getOperand(0)); + } else { + ST = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? 
ARM::t2STMIA : ARM::STMIA)); + } + + LD.addOperand(MI->getOperand(3)).addImm(ARMCC::AL).addReg(0); + ST.addOperand(MI->getOperand(2)).addImm(ARMCC::AL).addReg(0); + + for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) { + unsigned TmpReg = MRI.createVirtualRegister(&ARM::GPRRegClass); + LD.addReg(TmpReg, RegState::Define); + ST.addReg(TmpReg, RegState::Kill); + } + + MI->eraseFromParent(); +} + void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const { + if (MI->getOpcode() == ARM::MCOPY) { + LowerMCOPY(Subtarget, MI, Node); + return; + } + const MCInstrDesc *MCID = &MI->getDesc(); // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, // RSC. Coming out of isel, they have an implicit CPSR def, but the optional Index: lib/Target/ARM/ARMInstrInfo.td =================================================================== --- lib/Target/ARM/ARMInstrInfo.td +++ lib/Target/ARM/ARMInstrInfo.td @@ -74,6 +74,10 @@ def SDT_ARMVMAXNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>; def SDT_ARMVMINNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>; +def SDT_ARMMCOPY : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>, + SDTCisVT<4, i32>]>; + def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, @@ -180,6 +184,10 @@ def ARMvmaxnm : SDNode<"ARMISD::VMAXNM", SDT_ARMVMAXNM, []>; def ARMvminnm : SDNode<"ARMISD::VMINNM", SDT_ARMVMINNM, []>; +def ARMmcopy : SDNode<"ARMISD::MCOPY", SDT_ARMMCOPY, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPMayStore, SDNPMayLoad]>; + //===----------------------------------------------------------------------===// // ARM Instruction Predicate Definitions. 
// @@ -4553,6 +4561,13 @@ [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>; } +let hasPostISelHook = 1 in { + def MCOPY : PseudoInst< + (outs GPR:$newdst, GPR:$newsrc), (ins GPR:$dst, GPR:$src, i32imm:$nreg), + NoItinerary, + [(set GPR:$newdst, GPR:$newsrc, (ARMmcopy GPR:$dst, GPR:$src, imm:$nreg))]>; +} + def ldrex_1 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; }]>; Index: lib/Target/ARM/ARMSelectionDAGInfo.cpp =================================================================== --- lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -59,46 +59,39 @@ SDValue Loads[6]; uint64_t SrcOff = 0, DstOff = 0; - // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the - // same number of stores. The loads and stores will get combined into - // ldm/stm later on. - while (EmittedNumMemOps < NumMemOps) { - for (i = 0; - i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { - Loads[i] = DAG.getLoad(VT, dl, Chain, - DAG.getNode(ISD::ADD, dl, MVT::i32, Src, - DAG.getConstant(SrcOff, dl, MVT::i32)), - SrcPtrInfo.getWithOffset(SrcOff), isVolatile, - false, false, 0); - TFOps[i] = Loads[i].getValue(1); - SrcOff += VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - makeArrayRef(TFOps, i)); - - for (i = 0; - i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { - TFOps[i] = DAG.getStore(Chain, dl, Loads[i], - DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, - DAG.getConstant(DstOff, dl, MVT::i32)), - DstPtrInfo.getWithOffset(DstOff), - isVolatile, false, 0); - DstOff += VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - makeArrayRef(TFOps, i)); + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue); + + // Subtract 1 to avoid emitting an mcopy for a 4-byte copy; a load/store is + // good enough in that case. 
+ while (EmittedNumMemOps < NumMemOps - 1) { + // Use up to MAX_LOADS_IN_LDM registers per mcopy. The mcopys will get + // lowered into ldm/stm later on. + unsigned NumRegs = std::min(MAX_LOADS_IN_LDM, NumMemOps - EmittedNumMemOps); - EmittedNumMemOps += i; + Dst = DAG.getNode(ARMISD::MCOPY, dl, VTs, Chain, Dst, Src, + DAG.getConstant(NumRegs, dl, MVT::i32)); + Src = Dst.getValue(1); + Chain = Dst.getValue(2); + + DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize); + SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize); + + EmittedNumMemOps += NumRegs; } + BytesLeft += (NumMemOps - EmittedNumMemOps) * 4; + if (BytesLeft == 0) return Chain; - // Issue loads / stores for the trailing (1 - 3) bytes. + // Issue loads / stores for the trailing (1 - 7) bytes. unsigned BytesLeftSave = BytesLeft; i = 0; while (BytesLeft) { - if (BytesLeft >= 2) { + if (BytesLeft >= 4) { + VT = MVT::i32; + VTSize = 4; + } else if (BytesLeft >= 2) { VT = MVT::i16; VTSize = 2; } else { @@ -122,7 +115,10 @@ i = 0; BytesLeft = BytesLeftSave; while (BytesLeft) { - if (BytesLeft >= 2) { + if (BytesLeft >= 4) { + VT = MVT::i32; + VTSize = 4; + } else if (BytesLeft >= 2) { VT = MVT::i16; VTSize = 2; } else { Index: lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp =================================================================== --- lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -744,10 +744,21 @@ const MCSubtargetInfo &STI, raw_ostream &O) { O << "{"; - for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) { - if (i != OpNum) + + // The backend may have given us a register list in non-ascending order. Sort + // it now. 
+ std::vector<MCOperand> RegOps(MI->size() - OpNum); + std::copy(MI->begin() + OpNum, MI->end(), RegOps.begin()); + std::sort(RegOps.begin(), RegOps.end(), + [this](const MCOperand &O1, const MCOperand &O2) -> bool { + return MRI.getEncodingValue(O1.getReg()) < + MRI.getEncodingValue(O2.getReg()); + }); + + for (unsigned i = 0, e = RegOps.size(); i != e; ++i) { + if (i != 0) O << ", "; - printRegName(O, MI->getOperand(i).getReg()); + printRegName(O, RegOps[i].getReg()); } O << "}"; } Index: lib/Target/ARM/Thumb2SizeReduction.cpp =================================================================== --- lib/Target/ARM/Thumb2SizeReduction.cpp +++ lib/Target/ARM/Thumb2SizeReduction.cpp @@ -126,6 +126,7 @@ { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1,0 }, // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent + { ARM::t2STMIA, ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1,0 } }; @@ -432,6 +433,14 @@ isLdStMul = true; break; } + case ARM::t2STMIA: { + // If the base register is killed, we don't care what its value is after the + // instruction, so we can use an updating STMIA. + if (!MI->getOperand(0).isKill()) + return false; + + break; + } case ARM::t2LDMIA_RET: { unsigned BaseReg = MI->getOperand(1).getReg(); if (BaseReg != ARM::SP) @@ -489,6 +498,12 @@ // Add the 16-bit load / store instruction. DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc)); + + // tSTMIA_UPD takes a defining register operand. We've already checked that + // the register is killed, so mark it as dead here. 
+ if (Entry.WideOpc == ARM::t2STMIA) + MIB.addReg(MI->getOperand(0).getReg(), RegState::Define | RegState::Dead); + if (!isLdStMul) { MIB.addOperand(MI->getOperand(0)); MIB.addOperand(MI->getOperand(1)); Index: test/CodeGen/Thumb/ldm-stm-base-materialization.ll =================================================================== --- test/CodeGen/Thumb/ldm-stm-base-materialization.ll +++ test/CodeGen/Thumb/ldm-stm-base-materialization.ll @@ -9,12 +9,14 @@ define void @foo() #0 { entry: ; CHECK-LABEL: foo: -; CHECK: ldr r[[SB:[0-9]]], .LCPI ; CHECK: ldr r[[LB:[0-9]]], .LCPI ; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4 -; CHECK-NEXT: ldm r[[NLB]], +; CHECK: ldr r[[SB:[0-9]]], .LCPI ; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4 -; CHECK-NEXT: stm r[[NSB]] +; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]], r[[R4:[0-9]]]} +; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]], r[[R4]]} +; CHECK-NEXT: ldm r[[NLB]], {r[[R1:[0-9]]], r[[R2:[0-9]]]} +; CHECK-NEXT: stm r[[NSB]], {r[[R1]], r[[R2]]} %0 = load i32*, i32** @a, align 4 %arrayidx = getelementptr inbounds i32, i32* %0, i32 1 %1 = bitcast i32* %arrayidx to i8* Index: test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll =================================================================== --- test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll +++ test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll @@ -7,8 +7,8 @@ entry: ; CHECK-LABEL: t1: ; CHECK: ldr r[[LB:[0-9]]], -; CHECK-NEXT: ldm r[[LB]]!, ; CHECK-NEXT: ldr r[[SB:[0-9]]], +; CHECK-NEXT: ldm r[[LB]]!, ; CHECK-NEXT: stm r[[SB]]!, ; CHECK-NEXT: ldrb {{.*}}, [r[[LB]]] ; CHECK-NEXT: strb {{.*}}, [r[[SB]]] @@ -21,8 +21,8 @@ entry: ; CHECK-LABEL: t2: ; CHECK: ldr r[[LB:[0-9]]], -; CHECK-NEXT: ldm r[[LB]]!, ; CHECK-NEXT: ldr r[[SB:[0-9]]], +; CHECK-NEXT: ldm r[[LB]]!, ; CHECK-NEXT: stm r[[SB]]!, ; CHECK-NEXT: ldrh {{.*}}, [r[[LB]]] ; CHECK-NEXT: ldrb {{.*}}, [r[[LB]], #2]