Index: llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -1199,6 +1199,15 @@ return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy; } + /// \brief Get maximum # of store operations to be glued together + /// + /// This function returns the maximum number of store operations permitted + /// to glue together during lowering of llvm.memcpy. The value is set by + /// the target at the performance threshold for such a replacement. + virtual unsigned getMaxGluedStoresPerMemcpy() const { + return MaxGluedStoresPerMemcpy; + } + /// Get maximum # of load operations permitted for memcmp /// /// This function returns the maximum number of load operations permitted @@ -2509,6 +2518,14 @@ /// constant size. unsigned MaxStoresPerMemcpy; + + /// \brief Specify max number of store instructions to glue in inlined memcpy. + /// + /// When memcpy is inlined based on MaxStoresPerMemcpy, specify maximum number + /// of store instructions to keep together. This helps in pairing and + /// vectorization later on. + unsigned MaxGluedStoresPerMemcpy = 0; + /// Maximum number of store operations that may be substituted for a call to /// memcpy, used for functions with OptSize attribute. 
unsigned MaxStoresPerMemcpyOptSize; Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -89,6 +89,14 @@ #define DEBUG_TYPE "selectiondag" +static cl::opt<bool> EnableMemCpyDAGOpt("enable-memcpy-dag-opt", + cl::Hidden, cl::init(true), + cl::desc("Gang up loads and stores generated by inlining of memcpy")); + +static cl::opt<unsigned> MaxLdStGlue("ldstmemcpy-glue-max", + cl::desc("Number limit for gluing ld/st of memcpy."), + cl::Hidden, cl::init(0)); + static void NewSDValueDbgMsg(SDValue V, StringRef Msg, SelectionDAG *G) { LLVM_DEBUG(dbgs() << Msg; V.getNode()->dump(G);); } @@ -5218,6 +5226,31 @@ return MF.getFunction().optForSize(); } +/// Re-chain memcpy load/store pairs [From, To): all loads feed one TokenFactor +/// and each store is rebuilt on it; the chains are appended to \p OutChains. +static void chainLoadsAndStoresForMemcpy( + SelectionDAG &DAG, const SDLoc &dl, SmallVectorImpl<SDValue> &OutChains, + unsigned From, unsigned To, SmallVectorImpl<SDValue> &OutLoadChains, + SmallVectorImpl<SDValue> &OutStoreChains) { + assert(OutLoadChains.size() && "Missing loads in memcpy inlining"); + assert(OutStoreChains.size() && "Missing stores in memcpy inlining"); + SmallVector<SDValue, 16> GluedLoadChains; + for (unsigned i = From; i < To; ++i) { + OutChains.push_back(OutLoadChains[i]); + GluedLoadChains.push_back(OutLoadChains[i]); + } + // Chain for all loads. + SDValue LoadToken = + DAG.getNode(ISD::TokenFactor, dl, MVT::Other, GluedLoadChains); + for (unsigned i = From; i < To; ++i) { + // cast<> asserts on a non-store instead of returning null like dyn_cast. + StoreSDNode *ST = cast<StoreSDNode>(OutStoreChains[i]); + OutChains.push_back(DAG.getTruncStore( + LoadToken, dl, ST->getValue(), ST->getBasePtr(), ST->getMemoryVT(), + ST->getMemOperand())); + } +} + static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, unsigned Align, @@ -5282,7 +5315,9 @@ MachineMemOperand::Flags MMOFlags = isVol ? 
MachineMemOperand::MOVolatile : MachineMemOperand::MONone; - SmallVector OutChains; + SmallVector OutLoadChains; + SmallVector OutStoreChains; + SmallVector OutChains; unsigned NumMemOps = MemOps.size(); uint64_t SrcOff = 0, DstOff = 0; for (unsigned i = 0; i != NumMemOps; ++i) { @@ -5316,11 +5351,13 @@ SubSlice.Length = VTSize; } Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice); - if (Value.getNode()) + if (Value.getNode()) { Store = DAG.getStore(Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl), DstPtrInfo.getWithOffset(DstOff), Align, MMOFlags); + OutChains.push_back(Store); + } } if (!Store.getNode()) { @@ -5342,17 +5379,61 @@ DAG.getMemBasePlusOffset(Src, SrcOff, dl), SrcPtrInfo.getWithOffset(SrcOff), VT, MinAlign(SrcAlign, SrcOff), SrcMMOFlags); - OutChains.push_back(Value.getValue(1)); + OutLoadChains.push_back(Value.getValue(1)); + Store = DAG.getTruncStore( Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl), DstPtrInfo.getWithOffset(DstOff), VT, Align, MMOFlags); + OutStoreChains.push_back(Store); } - OutChains.push_back(Store); SrcOff += VTSize; DstOff += VTSize; Size -= VTSize; } + unsigned GluedLdStLimit = MaxLdStGlue == 0 ? + TLI.getMaxGluedStoresPerMemcpy() : MaxLdStGlue; + unsigned NumLdStInMemcpy = OutStoreChains.size(); + + if (NumLdStInMemcpy) { + // It may be that memcpy might be converted to memset if it's memcpy + // of constants. In such a case, we won't have loads and stores, but + // just stores. In the absence of loads, there is nothing to gang up. + if ((GluedLdStLimit <= 1) || !EnableMemCpyDAGOpt) { + // If the target does not care, just leave it as is. + for (unsigned i = 0; i < NumLdStInMemcpy; ++i) { + OutChains.push_back(OutLoadChains[i]); + OutChains.push_back(OutStoreChains[i]); + } + } else { + // Ld/St less than/equal limit set by target. 
+ if (NumLdStInMemcpy <= GluedLdStLimit) { + chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0, + NumLdStInMemcpy, OutLoadChains, + OutStoreChains); + } else { + unsigned NumberLdChain = NumLdStInMemcpy / GluedLdStLimit; + unsigned RemainingLdStInMemcpy = NumLdStInMemcpy % GluedLdStLimit; + unsigned GlueIter = 0; + + for (unsigned cnt = 0; cnt < NumberLdChain; ++cnt) { + unsigned IndexFrom = NumLdStInMemcpy - GlueIter - GluedLdStLimit; + unsigned IndexTo = NumLdStInMemcpy - GlueIter; + + chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, IndexFrom, IndexTo, + OutLoadChains, OutStoreChains); + GlueIter += GluedLdStLimit; + } + + // Residual ld/st. + if (RemainingLdStInMemcpy) { + chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0, + RemainingLdStInMemcpy, OutLoadChains, + OutStoreChains); + } + } + } + } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } Index: llvm/lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- llvm/lib/CodeGen/TargetLoweringBase.cpp +++ llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -535,6 +535,7 @@ // Perform these initializations only once. 
MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = MaxLoadsPerMemcmp = 8; + MaxGluedStoresPerMemcpy = 0; MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize = MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4; UseUnderscoreSetJmp = false; Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -580,6 +580,8 @@ setTargetDAGCombine(ISD::GlobalAddress); MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; + MaxGluedStoresPerMemcpy = 4; + MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4; Index: llvm/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll +++ llvm/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll @@ -8,9 +8,9 @@ ; CHECK: adrp x[[PAGE:[0-9]+]], {{l_b@PAGE|.Lb}} ; CHECK: add x[[ADDR:[0-9]+]], x[[PAGE]], {{l_b@PAGEOFF|:lo12:.Lb}} +; CHECK-NEXT: ldr [[VAL2:x[0-9]+]], [x[[ADDR]]] ; CHECK-NEXT: ldr [[VAL:w[0-9]+]], [x[[ADDR]], #8] ; CHECK-NEXT: str [[VAL]], [x0, #8] -; CHECK-NEXT: ldr [[VAL2:x[0-9]+]], [x[[ADDR]]] ; CHECK-NEXT: str [[VAL2]], [x0] define void @foo(i8* %a) { Index: llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll +++ llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll @@ -29,10 +29,10 @@ define void @t1(i8* nocapture %C) nounwind { entry: ; CHECK-LABEL: t1: -; CHECK: ldur [[DEST:q[0-9]+]], [x[[BASEREG:[0-9]+]], #15] -; CHECK: stur [[DEST]], [x0, #15] ; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]] -; CHECK: str [[DEST]], [x0] +; CHECK: ldur [[DEST:q[0-9]+]], [x[[BASEREG:[0-9]+]], #15] +; CHECK: stur [[DEST:q[0-9]+]], [x0, #15] +; CHECK: str 
[[DEST:q[0-9]+]], [x0] tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str1, i64 0, i64 0), i64 31, i1 false) ret void } @@ -52,9 +52,9 @@ define void @t3(i8* nocapture %C) nounwind { entry: ; CHECK-LABEL: t3: +; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]] ; CHECK: ldr [[REG4:x[0-9]+]], [x[[BASEREG:[0-9]+]], #16] ; CHECK: str [[REG4]], [x0, #16] -; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]] ; CHECK: str [[DEST]], [x0] tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str3, i64 0, i64 0), i64 24, i1 false) ret void Index: llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll +++ llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll @@ -130,12 +130,9 @@ ; CHECK: add x[[SRC:[0-9]+]], {{x[0-9]+}}, :lo12:var -; CHECK: ldr [[BLOCK:q[0-9]+]], [x[[SRC]]] +; CHECK: ldp [[BLOCK:q[0-9]+]], [[BLOCK:q[0-9]+]], [x[[SRC]]] ; CHECK: add x[[DST:[0-9]+]], {{x[0-9]+}}, :lo12:second_list -; CHECK: str [[BLOCK]], [x[[DST]]] - -; CHECK: ldr [[BLOCK:q[0-9]+]], [x[[SRC]], #16] -; CHECK: str [[BLOCK]], [x[[DST]], #16] +; CHECK: stp [[BLOCK:q[0-9]+]], [[BLOCK:q[0-9]+]], [x[[DST]]] ret void ; CHECK: ret } Index: llvm/test/CodeGen/AArch64/arm64-virtual_base.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-virtual_base.ll +++ llvm/test/CodeGen/AArch64/arm64-virtual_base.ll @@ -34,8 +34,8 @@ define void @Precompute_Patch_Values(%struct.Bicubic_Patch_Struct* %Shape) { ; CHECK: Precompute_Patch_Values ; CHECK: ldr [[VAL:x[0-9]+]], [x0, #288] -; CHECK-NEXT: str [[VAL]], [sp, #232] ; CHECK-NEXT: ldr [[VAL2:q[0-9]+]], [x0, #272] +; CHECK-NEXT: str [[VAL]], [sp, #232] ; CHECK-NEXT: stur [[VAL2]], {{\[}}sp, #216] entry: %Control_Points = alloca [16 x [3 x double]], align 8 Index: 
llvm/test/CodeGen/AArch64/mergestores_noimplicitfloat.ll =================================================================== --- llvm/test/CodeGen/AArch64/mergestores_noimplicitfloat.ll +++ llvm/test/CodeGen/AArch64/mergestores_noimplicitfloat.ll @@ -4,16 +4,14 @@ target triple = "arm64-apple-ios10.0.0" ; PR33475 - Expect 64-bit operations as 128-operations are not legal +; However, we can generate paired 64-bit loads and stores without using +; floating-point registers. ; CHECK-LABEL: pr33475 -; CHECK-DAG: ldr [[R0:x[0-9]+]], [x1] -; CHECK-DAG: str [[R0]], [x0] -; CHECK-DAG: ldr [[R1:x[0-9]+]], [x1, #8] -; CHECK-DAG: str [[R1]], [x0, #8] -; CHECK-DAG: ldr [[R2:x[0-9]+]], [x1, #16] -; CHECK-DAG: str [[R2]], [x0, #16] -; CHECK-DAG: ldr [[R3:x[0-9]+]], [x1, #24] -; CHECK-DAG: str [[R3]], [x0, #24] +; CHECK-DAG: ldp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x1, #16] +; CHECK-DAG: ldp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x1] +; CHECK-DAG: stp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x0, #16] +; CHECK-DAG: stp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x0] define void @pr33475(i8* %p0, i8* %p1) noimplicitfloat { call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %p0, i8* align 4 %p1, i64 32, i1 false) Index: llvm/test/CodeGen/ARM/memcpy-inline.ll =================================================================== --- llvm/test/CodeGen/ARM/memcpy-inline.ll +++ llvm/test/CodeGen/ARM/memcpy-inline.ll @@ -44,15 +44,14 @@ define void @t2(i8* nocapture %C) nounwind { entry: ; CHECK-LABEL: t2: -; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]! +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]! 
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2] +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] ; CHECK: movs [[INC:r[0-9]+]], #32 -; CHECK: add.w r3, r0, #16 ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0], [[INC]] ; CHECK: movw [[REG2:r[0-9]+]], #16716 ; CHECK: movt [[REG2:r[0-9]+]], #72 ; CHECK: str [[REG2]], [r0] -; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] -; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r3] ; CHECK-T1-LABEL: t2: ; CHECK-T1: bl _memcpy tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str2, i64 0, i64 0), i64 36, i1 false)