Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1199,6 +1199,15 @@
     return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
   }
 
+  /// Get maximum # of store operations to be glued together
+  ///
+  /// This function returns the maximum number of store operations permitted
+  /// to be glued together during lowering of llvm.memcpy. The value is set
+  /// by the target at the performance threshold for such a replacement.
+  virtual unsigned getMaxGluedStoresPerMemcpy() const {
+    return MaxGluedStoresPerMemcpy;
+  }
+
   /// Get maximum # of load operations permitted for memcmp
   ///
   /// This function returns the maximum number of load operations permitted
@@ -2504,6 +2513,14 @@
   /// constant size.
   unsigned MaxStoresPerMemcpy;
 
+  /// Specify max number of store instructions to glue in inlined memcpy.
+  ///
+  /// When memcpy is inlined based on MaxStoresPerMemcpy, specify the
+  /// maximum number of store instructions to keep together. This helps in
+  /// pairing and vectorization later on.
+  unsigned MaxGluedStoresPerMemcpy = 0;
+
   /// Maximum number of store operations that may be substituted for a call to
   /// memcpy, used for functions with OptSize attribute.
   unsigned MaxStoresPerMemcpyOptSize;
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -89,6 +89,15 @@
 
 #define DEBUG_TYPE "selectiondag"
 
+static cl::opt<bool> DisableMemCpyDAGOpt("disable-memcpy-dag-opt",
+       cl::Hidden, cl::init(false),
+       cl::desc("Disable ganging up loads and stores generated by inlining "
+                "of memcpy; gluing them keeps such loads and stores "
+                "together so that they can be paired."));
+
+static cl::opt<unsigned> MaxLdStGlue("ldstmemcpy-glue-max",
+       cl::desc("Number limit for gluing ld/st of memcpy."),
+       cl::Hidden, cl::init(0));
+
 static void NewSDValueDbgMsg(SDValue V, StringRef Msg, SelectionDAG *G) {
   DEBUG(
     dbgs() << Msg;
@@ -5219,6 +5228,31 @@
   return MF.getFunction().optForSize();
 }
 
+static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+                                         SmallVectorImpl<SDValue> &OutChains,
+                                         unsigned From, unsigned To,
+                                         SmallVectorImpl<SDValue> &OutLoadChains,
+                                         SmallVectorImpl<SDValue> &OutStoreChains) {
+  assert(OutLoadChains.size() && "Missing loads in memcpy inlining");
+  assert(OutStoreChains.size() && "Missing stores in memcpy inlining");
+  SmallVector<SDValue, 16> GluedLoadChains;
+  for (unsigned i = From; i < To; ++i) {
+    OutChains.push_back(OutLoadChains[i]);
+    GluedLoadChains.push_back(OutLoadChains[i]);
+  }
+
+  // Chain for all loads.
+  SDValue LoadToken = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                                  GluedLoadChains);
+
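+  // Rebuild each store in [From, To) on top of the merged load token, so
+  // that every load in this group is issued before any of the group's
+  // stores. Keeping a group's loads and stores adjacent makes it easier
+  // for later passes to pair them.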
+  for (unsigned i = From; i < To; ++i) {
+    StoreSDNode *ST = dyn_cast<StoreSDNode>(OutStoreChains[i]);
+    SDValue NewStore = DAG.getTruncStore(LoadToken, dl, ST->getValue(),
+                                         ST->getBasePtr(), ST->getMemoryVT(),
+                                         ST->getMemOperand());
+    OutChains.push_back(NewStore);
+  }
+}
+
 static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
                                        SDValue Chain, SDValue Dst, SDValue Src,
                                        uint64_t Size, unsigned Align,
@@ -5283,7 +5317,9 @@
   MachineMemOperand::Flags MMOFlags =
       isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
-  SmallVector<SDValue, 8> OutChains;
+  SmallVector<SDValue, 16> OutLoadChains;
+  SmallVector<SDValue, 16> OutStoreChains;
+  SmallVector<SDValue, 32> OutChains;
   unsigned NumMemOps = MemOps.size();
   uint64_t SrcOff = 0, DstOff = 0;
   for (unsigned i = 0; i != NumMemOps; ++i) {
@@ -5317,11 +5353,14 @@
         SubSlice.Length = VTSize;
       }
       Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice);
-      if (Value.getNode())
+      if (Value.getNode()) {
         Store = DAG.getStore(Chain, dl, Value,
                              DAG.getMemBasePlusOffset(Dst, DstOff, dl),
                              DstPtrInfo.getWithOffset(DstOff), Align, MMOFlags);
+        OutChains.push_back(Store);
+      }
+
     }
 
     if (!Store.getNode()) {
@@ -5343,17 +5382,62 @@
                           DAG.getMemBasePlusOffset(Src, SrcOff, dl),
                           SrcPtrInfo.getWithOffset(SrcOff), VT,
                           MinAlign(SrcAlign, SrcOff), SrcMMOFlags);
-      OutChains.push_back(Value.getValue(1));
+      OutLoadChains.push_back(Value.getValue(1));
+
       Store = DAG.getTruncStore(
           Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
          DstPtrInfo.getWithOffset(DstOff), VT, Align, MMOFlags);
+      OutStoreChains.push_back(Store);
     }
-    OutChains.push_back(Store);
     SrcOff += VTSize;
     DstOff += VTSize;
     Size -= VTSize;
   }
 
+  unsigned GluedLdStLimit = (MaxLdStGlue == 0 ?
+                             TLI.getMaxGluedStoresPerMemcpy() : MaxLdStGlue);
+  unsigned NumLdStInMemcpy = OutStoreChains.size();
+
+  if (NumLdStInMemcpy) {
+    // Only glue if we have loads and stores from the memcpy itself. A
+    // memcpy of constants may have been converted to memset-like stores
+    // above; in that case there is nothing to glue.
+    if ((GluedLdStLimit <= 1) || DisableMemCpyDAGOpt) {
+      // If the target does not care, just leave the chains as they are.
+      for (unsigned i = 0; i < NumLdStInMemcpy; ++i) {
+        OutChains.push_back(OutLoadChains[i]);
+        OutChains.push_back(OutStoreChains[i]);
+      }
+    } else {
+      if (NumLdStInMemcpy <= GluedLdStLimit) {
+        // Ld/St count is less than or equal to the limit set by the target.
+        chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
+                                     NumLdStInMemcpy, OutLoadChains,
+                                     OutStoreChains);
+      } else {
+        unsigned NumberLdChain = NumLdStInMemcpy / GluedLdStLimit;
+        unsigned RemainingLdStInMemcpy = NumLdStInMemcpy % GluedLdStLimit;
+        unsigned GlueIter = 0;
+
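+        // Glue full groups of GluedLdStLimit operations starting from the
+        // end; e.g. with NumLdStInMemcpy = 10 and GluedLdStLimit = 4 this
+        // chains [6, 10) and then [2, 6), leaving [0, 2) to the residual
+        // handling below.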
+        for (unsigned cnt = 0; cnt < NumberLdChain; ++cnt) {
+          unsigned IndexFrom = NumLdStInMemcpy - GlueIter - GluedLdStLimit;
+          unsigned IndexTo = NumLdStInMemcpy - GlueIter;
+
+          chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, IndexFrom, IndexTo,
+                                       OutLoadChains, OutStoreChains);
+          GlueIter += GluedLdStLimit;
+        }
+
+        // Residual ld/st.
+        if (RemainingLdStInMemcpy) {
+          chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
+                                       RemainingLdStInMemcpy, OutLoadChains,
+                                       OutStoreChains);
+        }
+      }
+    }
+  }
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
 }
Index: llvm/lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -535,6 +535,7 @@
   // Perform these initializations only once.
   MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove =
       MaxLoadsPerMemcmp = 8;
+  MaxGluedStoresPerMemcpy = 0;
   MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
       MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
   UseUnderscoreSetJmp = false;
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -579,9 +579,15 @@
 
   setTargetDAGCombine(ISD::GlobalAddress);
 
-  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
-  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
-  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
+  MaxStoresPerMemset = 32;
+  MaxStoresPerMemsetOptSize = 8;
+
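+  // Allow up to 16 stores per inline memcpy (256 bytes with 16-byte vector
+  // registers), glued in groups of 4 so that each group's loads are issued
+  // before its stores and can later be combined into ldp/stp pairs.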
+  MaxStoresPerMemcpy = 16;
+  MaxStoresPerMemcpyOptSize = 4;
+  MaxGluedStoresPerMemcpy = 4;
+
+  MaxStoresPerMemmove = 16;
+  MaxStoresPerMemmoveOptSize = 4;
 
   setStackPointerRegisterToSaveRestore(AArch64::SP);
Index: llvm/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll
+++ llvm/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll
@@ -8,9 +8,9 @@
 
 ; CHECK:      adrp x[[PAGE:[0-9]+]], {{l_b@PAGE|.Lb}}
 ; CHECK:      add  x[[ADDR:[0-9]+]], x[[PAGE]], {{l_b@PAGEOFF|:lo12:.Lb}}
+; CHECK-NEXT: ldr  [[VAL2:x[0-9]+]], [x[[ADDR]]]
 ; CHECK-NEXT: ldr  [[VAL:w[0-9]+]], [x[[ADDR]], #8]
 ; CHECK-NEXT: str  [[VAL]], [x0, #8]
-; CHECK-NEXT: ldr  [[VAL2:x[0-9]+]], [x[[ADDR]]]
 ; CHECK-NEXT: str  [[VAL2]], [x0]
 
 define void @foo(i8* %a) {
Index: llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll
+++ llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll
@@ -29,10 +29,10 @@
 define void @t1(i8* nocapture %C) nounwind {
 entry:
 ; CHECK-LABEL: t1:
-; CHECK: ldur [[DEST:q[0-9]+]], [x[[BASEREG:[0-9]+]], #15]
-; CHECK: stur [[DEST]], [x0, #15]
-; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]]
-; CHECK: str [[DEST]], [x0]
+; CHECK: ldr [[DEST1:q[0-9]+]], [x[[BASEREG:[0-9]+]]]
+; CHECK: ldur [[DEST2:q[0-9]+]], [x[[BASEREG]], #15]
+; CHECK: stur [[DEST2]], [x0, #15]
+; CHECK: str [[DEST1]], [x0]
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str1, i64 0, i64 0), i64 31, i1 false)
   ret void
 }
@@ -52,9 +52,9 @@
 define void @t3(i8* nocapture %C) nounwind {
 entry:
 ; CHECK-LABEL: t3:
-; CHECK: ldr [[REG4:x[0-9]+]], [x[[BASEREG:[0-9]+]], #16]
+; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG:[0-9]+]]]
+; CHECK: ldr [[REG4:x[0-9]+]], [x[[BASEREG]], #16]
 ; CHECK: str [[REG4]], [x0, #16]
-; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]]
 ; CHECK: str [[DEST]], [x0]
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str3, i64 0, i64 0), i64 24, i1 false)
   ret void
Index: llvm/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
+++ llvm/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
@@ -6,8 +6,9 @@
 
 ; CHECK-LABEL: fct1:
 ; For small size (<= 256), we do not change memset to bzero.
-; CHECK-DARWIN: {{b|bl}} _memset
-; CHECK-LINUX: {{b|bl}} memset
+; Inline memset.
+; CHECK: stp [[R0:q[0-9]+]], [[R0:q[0-9]+]], [x0, #224]
+; CHECK: stp [[R0:q[0-9]+]], [[R0:q[0-9]+]], [x0]
 define void @fct1(i8* nocapture %ptr) {
 entry:
   tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 256, i1 false)
@@ -38,9 +39,9 @@
 }
 
 ; CHECK-LABEL: fct4:
-; Size <= 256, no change.
-; CHECK-DARWIN: {{b|bl}} _memset
-; CHECK-LINUX: {{b|bl}} memset
+; Size <= 256, inline memset.
+; CHECK: stp [[R0:q[0-9]+]], [[R0:q[0-9]+]], [x0, #224]
+; CHECK: stp [[R0:q[0-9]+]], [[R0:q[0-9]+]], [x0]
 define void @fct4(i8* %ptr) {
 entry:
   %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
@@ -85,7 +86,7 @@
 define void @fct7(i8* %ptr) {
 entry:
   %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
-  %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 256, i64 %tmp)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 260, i64 %tmp)
   ret void
 }
 
Index: llvm/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
+++ llvm/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
@@ -1,13 +1,13 @@
 ; RUN: llc -mtriple=arm64-apple-ios -mattr=+strict-align < %s | FileCheck %s
 
-; Small (16-bytes here) unaligned memcpys should stay memcpy calls if
+; Small (24 bytes here) unaligned memcpys should stay memcpy calls if
 ; strict-alignment is turned on.
 define void @t0(i8* %out, i8* %in) {
 ; CHECK-LABEL: t0:
-; CHECK: orr w2, wzr, #0x10
+; CHECK: orr w2, wzr, #0x18
 ; CHECK-NEXT: bl _memcpy
 entry:
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 16, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 24, i1 false)
   ret void
 }
Index: llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
+++ llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
@@ -130,12 +130,9 @@
 
 ; CHECK: add x[[SRC:[0-9]+]], {{x[0-9]+}}, :lo12:var
 
-; CHECK: ldr [[BLOCK:q[0-9]+]], [x[[SRC]]]
+; CHECK: ldp [[BLOCK1:q[0-9]+]], [[BLOCK2:q[0-9]+]], [x[[SRC]]]
 ; CHECK: add x[[DST:[0-9]+]], {{x[0-9]+}}, :lo12:second_list
-; CHECK: str [[BLOCK]], [x[[DST]]]
-
-; CHECK: ldr [[BLOCK:q[0-9]+]], [x[[SRC]], #16]
-; CHECK: str [[BLOCK]], [x[[DST]], #16]
+; CHECK: stp [[BLOCK1]], [[BLOCK2]], [x[[DST]]]
 
   ret void
 ; CHECK: ret
 }
Index: llvm/test/CodeGen/AArch64/arm64-virtual_base.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-virtual_base.ll
+++ llvm/test/CodeGen/AArch64/arm64-virtual_base.ll
@@ -34,8 +34,8 @@
 define void @Precompute_Patch_Values(%struct.Bicubic_Patch_Struct* %Shape) {
 ; CHECK: Precompute_Patch_Values
 ; CHECK: ldr [[VAL:x[0-9]+]], [x0, #288]
-; CHECK-NEXT: str [[VAL]], [sp, #232]
 ; CHECK-NEXT: ldr [[VAL2:q[0-9]+]], [x0, #272]
+; CHECK-NEXT: str [[VAL]], [sp, #232]
 ; CHECK-NEXT: stur [[VAL2]], {{\[}}sp, #216]
 entry:
   %Control_Points = alloca [16 x [3 x double]], align 8
Index: llvm/test/CodeGen/AArch64/mergestores_noimplicitfloat.ll
===================================================================
--- llvm/test/CodeGen/AArch64/mergestores_noimplicitfloat.ll
+++ llvm/test/CodeGen/AArch64/mergestores_noimplicitfloat.ll
@@ -4,16 +4,14 @@
 target triple = "arm64-apple-ios10.0.0"
 
 ; PR33475 - Expect 64-bit operations as 128-operations are not legal
+; However, we can generate paired 64-bit loads and stores without using
+; floating-point registers.
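+; With glued groups of loads and stores, the 32-byte copy becomes two
+; x-register ldp/stp pairs rather than four individual ldr/str pairs.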
 
 ; CHECK-LABEL: pr33475
-; CHECK-DAG: ldr [[R0:x[0-9]+]], [x1]
-; CHECK-DAG: str [[R0]], [x0]
-; CHECK-DAG: ldr [[R1:x[0-9]+]], [x1, #8]
-; CHECK-DAG: str [[R1]], [x0, #8]
-; CHECK-DAG: ldr [[R2:x[0-9]+]], [x1, #16]
-; CHECK-DAG: str [[R2]], [x0, #16]
-; CHECK-DAG: ldr [[R3:x[0-9]+]], [x1, #24]
-; CHECK-DAG: str [[R3]], [x0, #24]
+; CHECK-DAG: ldp [[R0:x[0-9]+]], [[R1:x[0-9]+]], [x1, #16]
+; CHECK-DAG: ldp [[R2:x[0-9]+]], [[R3:x[0-9]+]], [x1]
+; CHECK-DAG: stp [[R0]], [[R1]], [x0, #16]
+; CHECK-DAG: stp [[R2]], [[R3]], [x0]
 
 define void @pr33475(i8* %p0, i8* %p1) noimplicitfloat {
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %p0, i8* align 4 %p1, i64 32, i1 false)
Index: llvm/test/CodeGen/ARM/memcpy-inline.ll
===================================================================
--- llvm/test/CodeGen/ARM/memcpy-inline.ll
+++ llvm/test/CodeGen/ARM/memcpy-inline.ll
@@ -44,15 +44,14 @@
 define void @t2(i8* nocapture %C) nounwind {
 entry:
 ; CHECK-LABEL: t2:
-; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
+; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]!
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]
+; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
 ; CHECK: movs [[INC:r[0-9]+]], #32
-; CHECK: add.w r3, r0, #16
 ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0], [[INC]]
 ; CHECK: movw [[REG2:r[0-9]+]], #16716
 ; CHECK: movt [[REG2:r[0-9]+]], #72
 ; CHECK: str [[REG2]], [r0]
-; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
-; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r3]
 ; CHECK-T1-LABEL: t2:
 ; CHECK-T1: bl _memcpy
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str2, i64 0, i64 0), i64 36, i1 false)