Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.h =================================================================== --- llvm/trunk/lib/Target/ARM/ARMISelLowering.h +++ llvm/trunk/lib/Target/ARM/ARMISelLowering.h @@ -189,6 +189,10 @@ // Vector bitwise select VBSL, + // Pseudo-instruction representing a memory copy using ldm/stm + // instructions. + MCOPY, + // Vector load N-element structure to all lanes: VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, VLD3DUP, Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp @@ -1122,6 +1122,7 @@ case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; case ARMISD::VBSL: return "ARMISD::VBSL"; + case ARMISD::MCOPY: return "ARMISD::MCOPY"; case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; @@ -7629,8 +7630,59 @@ } } +/// \brief Lowers MCOPY to either LDMIA/STMIA or LDMIA_UPD/STMID_UPD depending +/// on whether the result is used. This is done as a post-isel lowering instead +/// of as a custom inserter because we need the use list from the SDNode. +static void LowerMCOPY(const ARMSubtarget *Subtarget, MachineInstr *MI, + SDNode *Node) { + bool isThumb1 = Subtarget->isThumb1Only(); + bool isThumb2 = Subtarget->isThumb2(); + const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); + + DebugLoc dl = MI->getDebugLoc(); + MachineBasicBlock *BB = MI->getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + MachineInstrBuilder LD, ST; + if (isThumb1 || Node->hasAnyUseOfValue(1)) { + LD = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA_UPD + : isThumb1 ? ARM::tLDMIA_UPD + : ARM::LDMIA_UPD)) + .addOperand(MI->getOperand(1)); + } else { + LD = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA : ARM::LDMIA)); + } + + if (isThumb1 || Node->hasAnyUseOfValue(0)) { + ST = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA_UPD + : isThumb1 ? ARM::tSTMIA_UPD + : ARM::STMIA_UPD)) + .addOperand(MI->getOperand(0)); + } else { + ST = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA : ARM::STMIA)); + } + + LD.addOperand(MI->getOperand(3)).addImm(ARMCC::AL).addReg(0); + ST.addOperand(MI->getOperand(2)).addImm(ARMCC::AL).addReg(0); + + for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) { + unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass + : &ARM::GPRRegClass); + LD.addReg(TmpReg, RegState::Define); + ST.addReg(TmpReg, RegState::Kill); + } + + MI->eraseFromParent(); +} + void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const { + if (MI->getOpcode() == ARM::MCOPY) { + LowerMCOPY(Subtarget, MI, Node); + return; + } + const MCInstrDesc *MCID = &MI->getDesc(); // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, // RSC. Coming out of isel, they have an implicit CPSR def, but the optional Index: llvm/trunk/lib/Target/ARM/ARMInstrInfo.td =================================================================== --- llvm/trunk/lib/Target/ARM/ARMInstrInfo.td +++ llvm/trunk/lib/Target/ARM/ARMInstrInfo.td @@ -73,6 +73,10 @@ def SDT_ARMVMAXNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>; def SDT_ARMVMINNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>; +def SDT_ARMMCOPY : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>, + SDTCisVT<4, i32>]>; + def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, @@ -179,6 +183,10 @@ def ARMvmaxnm : SDNode<"ARMISD::VMAXNM", SDT_ARMVMAXNM, []>; def ARMvminnm : SDNode<"ARMISD::VMINNM", SDT_ARMVMINNM, []>; +def ARMmcopy : SDNode<"ARMISD::MCOPY", SDT_ARMMCOPY, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPMayStore, SDNPMayLoad]>; + //===----------------------------------------------------------------------===// // ARM Instruction Predicate Definitions. // @@ -4552,6 +4560,13 @@ [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>; } +let hasPostISelHook = 1 in { + def MCOPY : PseudoInst< + (outs GPR:$newdst, GPR:$newsrc), (ins GPR:$dst, GPR:$src, i32imm:$nreg), + NoItinerary, + [(set GPR:$newdst, GPR:$newsrc, (ARMmcopy GPR:$dst, GPR:$src, imm:$nreg))]>; +} + def ldrex_1 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i8; }]>; Index: llvm/trunk/lib/Target/ARM/ARMSelectionDAGInfo.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ llvm/trunk/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -164,41 +164,38 @@ unsigned VTSize = 4; unsigned i = 0; // Emit a maximum of 4 loads in Thumb1 since we have fewer registers - const unsigned MAX_LOADS_IN_LDM = Subtarget.isThumb1Only() ? 4 : 6; + const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6; SDValue TFOps[6]; SDValue Loads[6]; uint64_t SrcOff = 0, DstOff = 0; - // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the - // same number of stores. The loads and stores will get combined into - // ldm/stm later on. - while (EmittedNumMemOps < NumMemOps) { - for (i = 0; - i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { - Loads[i] = DAG.getLoad(VT, dl, Chain, - DAG.getNode(ISD::ADD, dl, MVT::i32, Src, - DAG.getConstant(SrcOff, dl, MVT::i32)), - SrcPtrInfo.getWithOffset(SrcOff), isVolatile, - false, false, 0); - TFOps[i] = Loads[i].getValue(1); - SrcOff += VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - makeArrayRef(TFOps, i)); + // FIXME: We should invent a VMCOPY pseudo-instruction that lowers to + // VLDM/VSTM and make this code emit it when appropriate. This would reduce + // pressure on the general purpose registers. However this seems harder to map + // onto the register allocator's view of the world. + + // The number of MCOPY pseudo-instructions to emit. We use up to MaxLoadsInLDM + // registers per mcopy, which will get lowered into ldm/stm later on. This is + // a lower bound on the number of MCOPY operations we must emit. + unsigned NumMCOPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM; + + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue); + + for (unsigned I = 0; I != NumMCOPYs; ++I) { + // Evenly distribute registers among MCOPY operations to reduce register + // pressure. + unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMCOPYs; + unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps; + + Dst = DAG.getNode(ARMISD::MCOPY, dl, VTs, Chain, Dst, Src, + DAG.getConstant(NumRegs, dl, MVT::i32)); + Src = Dst.getValue(1); + Chain = Dst.getValue(2); - for (i = 0; - i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { - TFOps[i] = DAG.getStore(Chain, dl, Loads[i], - DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, - DAG.getConstant(DstOff, dl, MVT::i32)), - DstPtrInfo.getWithOffset(DstOff), - isVolatile, false, 0); - DstOff += VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - makeArrayRef(TFOps, i)); + DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize); + SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize); - EmittedNumMemOps += i; + EmittedNumMemOps = NextEmittedNumMemOps; } if (BytesLeft == 0) Index: llvm/trunk/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ llvm/trunk/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -744,10 +744,21 @@ const MCSubtargetInfo &STI, raw_ostream &O) { O << "{"; - for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) { - if (i != OpNum) + + // The backend may have given us a register list in non-ascending order. Sort + // it now. + std::vector RegOps(MI->size() - OpNum); + std::copy(MI->begin() + OpNum, MI->end(), RegOps.begin()); + std::sort(RegOps.begin(), RegOps.end(), + [this](const MCOperand &O1, const MCOperand &O2) -> bool { + return MRI.getEncodingValue(O1.getReg()) < + MRI.getEncodingValue(O2.getReg()); + }); + + for (unsigned i = 0, e = RegOps.size(); i != e; ++i) { + if (i != 0) O << ", "; - printRegName(O, MI->getOperand(i).getReg()); + printRegName(O, RegOps[i].getReg()); } O << "}"; } Index: llvm/trunk/lib/Target/ARM/Thumb2SizeReduction.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/Thumb2SizeReduction.cpp +++ llvm/trunk/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -125,7 +125,10 @@ { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1,0 }, - // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent + // ARM::t2STMIA (with no basereg writeback) has no Thumb1 equivalent. + // tSTMIA_UPD is a change in semantics which can only be used if the base + // register is killed. This difference is correctly handled elsewhere. + { ARM::t2STMIA, ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1,0 } }; @@ -432,6 +435,14 @@ isLdStMul = true; break; } + case ARM::t2STMIA: { + // If the base register is killed, we don't care what its value is after the + // instruction, so we can use an updating STMIA. + if (!MI->getOperand(0).isKill()) + return false; + + break; + } case ARM::t2LDMIA_RET: { unsigned BaseReg = MI->getOperand(1).getReg(); if (BaseReg != ARM::SP) @@ -489,6 +500,12 @@ // Add the 16-bit load / store instruction. DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc)); + + // tSTMIA_UPD takes a defining register operand. We've already checked that + // the register is killed, so mark it as dead here. + if (Entry.WideOpc == ARM::t2STMIA) + MIB.addReg(MI->getOperand(0).getReg(), RegState::Define | RegState::Dead); + if (!isLdStMul) { MIB.addOperand(MI->getOperand(0)); MIB.addOperand(MI->getOperand(1)); Index: llvm/trunk/test/CodeGen/Thumb/ldm-stm-base-materialization.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb/ldm-stm-base-materialization.ll +++ llvm/trunk/test/CodeGen/Thumb/ldm-stm-base-materialization.ll @@ -6,15 +6,17 @@ @b = external global i32* ; Function Attrs: nounwind -define void @foo() #0 { +define void @foo24() #0 { entry: -; CHECK-LABEL: foo: -; CHECK: ldr r[[SB:[0-9]]], .LCPI +; CHECK-LABEL: foo24: ; CHECK: ldr r[[LB:[0-9]]], .LCPI ; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4 -; CHECK-NEXT: ldm r[[NLB]], +; CHECK: ldr r[[SB:[0-9]]], .LCPI ; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4 -; CHECK-NEXT: stm r[[NSB]] +; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]} +; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]} +; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]} +; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]} %0 = load i32*, i32** @a, align 4 %arrayidx = getelementptr inbounds i32, i32* %0, i32 1 %1 = bitcast i32* %arrayidx to i8* @@ -25,5 +27,70 @@ ret void } +define void @foo28() #0 { +entry: +; CHECK-LABEL: foo28: +; CHECK: ldr r[[LB:[0-9]]], .LCPI +; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4 +; CHECK: ldr r[[SB:[0-9]]], .LCPI +; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4 +; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]} +; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]} +; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]], r[[R4:[0-9]]]} +; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]], r[[R4]]} + %0 = load i32*, i32** @a, align 4 + %arrayidx = getelementptr inbounds i32, i32* %0, i32 1 + %1 = bitcast i32* %arrayidx to i8* + %2 = load i32*, i32** @b, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1 + %3 = bitcast i32* %arrayidx1 to i8* + tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 28, i32 4, i1 false) + ret void +} + +define void @foo32() #0 { +entry: +; CHECK-LABEL: foo32: +; CHECK: ldr r[[LB:[0-9]]], .LCPI +; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4 +; CHECK: ldr r[[SB:[0-9]]], .LCPI +; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4 +; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]], r[[R4:[0-9]]]} +; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]], r[[R4]]} +; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]], r[[R4:[0-9]]]} +; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]], r[[R4]]} + %0 = load i32*, i32** @a, align 4 + %arrayidx = getelementptr inbounds i32, i32* %0, i32 1 + %1 = bitcast i32* %arrayidx to i8* + %2 = load i32*, i32** @b, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1 + %3 = bitcast i32* %arrayidx1 to i8* + tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 32, i32 4, i1 false) + ret void +} + +define void @foo36() #0 { +entry: +; CHECK-LABEL: foo36: +; CHECK: ldr r[[LB:[0-9]]], .LCPI +; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4 +; CHECK: ldr r[[SB:[0-9]]], .LCPI +; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4 +; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]} +; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]} +; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]} +; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]} +; CHECK-NEXT: ldm r[[NLB]]!, {r[[R1:[0-9]]], r[[R2:[0-9]]], r[[R3:[0-9]]]} +; CHECK-NEXT: stm r[[NSB]]!, {r[[R1]], r[[R2]], r[[R3]]} + %0 = load i32*, i32** @a, align 4 + %arrayidx = getelementptr inbounds i32, i32* %0, i32 1 + %1 = bitcast i32* %arrayidx to i8* + %2 = load i32*, i32** @b, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %2, i32 1 + %3 = bitcast i32* %arrayidx1 to i8* + tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 36, i32 4, i1 false) + ret void +} + ; Function Attrs: nounwind declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1 Index: llvm/trunk/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll +++ llvm/trunk/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll @@ -7,8 +7,8 @@ entry: ; CHECK-LABEL: t1: ; CHECK: ldr r[[LB:[0-9]]], -; CHECK-NEXT: ldm r[[LB]]!, ; CHECK-NEXT: ldr r[[SB:[0-9]]], +; CHECK-NEXT: ldm r[[LB]]!, ; CHECK-NEXT: stm r[[SB]]!, ; CHECK-NEXT: ldrb {{.*}}, [r[[LB]]] ; CHECK-NEXT: strb {{.*}}, [r[[SB]]] @@ -21,8 +21,8 @@ entry: ; CHECK-LABEL: t2: ; CHECK: ldr r[[LB:[0-9]]], -; CHECK-NEXT: ldm r[[LB]]!, ; CHECK-NEXT: ldr r[[SB:[0-9]]], +; CHECK-NEXT: ldm r[[LB]]!, ; CHECK-NEXT: stm r[[SB]]!, ; CHECK-NEXT: ldrh {{.*}}, [r[[LB]]] ; CHECK-NEXT: ldrb {{.*}}, [r[[LB]], #2]