diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -35,8 +35,10 @@
 class StoreInst;
 class TargetLibraryInfo;
 class Value;
+class TargetTransformInfo;
 
 class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
+  const TargetTransformInfo *TTI = nullptr;
   TargetLibraryInfo *TLI = nullptr;
   AAResults *AA = nullptr;
   AssumptionCache *AC = nullptr;
@@ -50,8 +52,9 @@
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 
   // Glue for the old PM.
-  bool runImpl(Function &F, TargetLibraryInfo *TLI, AAResults *AA,
-               AssumptionCache *AC, DominatorTree *DT, MemorySSA *MSSA);
+  bool runImpl(Function &F, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
+               AAResults *AA, AssumptionCache *AC, DominatorTree *DT,
+               MemorySSA *MSSA);
 
 private:
   // Helper functions
@@ -62,7 +65,10 @@
   bool performCallSlotOptzn(Instruction *cpyLoad, Instruction *cpyStore,
                             Value *cpyDst, Value *cpySrc, TypeSize cpyLen,
                             Align cpyAlign, std::function<CallInst *()> GetC);
-  bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
+  bool processMemCpyThroughIntermediate(MemCpyInst *M, MemCpyInst *MDep,
+                                        BasicBlock::iterator &BBI);
+  bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
+                                     BasicBlock::iterator &BBI);
   bool processMemSetMemCpyDependence(MemCpyInst *MemCpy, MemSetInst *MemSet);
   bool performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, MemSetInst *MemSet);
   bool processByValArgument(CallBase &CB, unsigned ArgNo);
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -27,6 +27,7 @@
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
@@ -67,6 +68,9 @@
     "enable-memcpyopt-without-libcalls", cl::Hidden,
     cl::desc("Enable memcpyopt even when libcalls are disabled"));
 
+STATISTIC(NumMemCpyThroughIntermediate,
+          "Number of memcpy-of-memcpy pairs with a clobber in between that "
+          "were expanded into loads and stores");
 STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
 STATISTIC(NumMemSetInfer, "Number of memsets inferred");
 STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
@@ -274,6 +278,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
@@ -294,6 +299,7 @@
 INITIALIZE_PASS_BEGIN(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
                       false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
@@ -1142,10 +1148,86 @@
   return true;
 }
 
+// We have something like this:
+// ```
+// MDep: memcpy(tmp <- a)
+// ...
+// ... a is potentially modified in between, e.g.:
+//     memcpy(a <- b)
+// ...
+// M: memcpy(b <- tmp)
+// ```
+// Since we know that tmp was last modified by MDep, we can expand MDep's
+// memcpy into a load+store pair and then turn M's memcpy into a store of
+// MDep's loaded value:
+// ```
+// reload = load a
+// store reload, tmp ; spill
+// ...
+// store reload, b ; final store
+// ```
+// This pattern can happen e.g. when swapping the contents of a and b, in
+// which case tmp may go away completely, especially if it is an alloca.
+//
+// This isn't always an obvious improvement, and creating large vectors can
+// easily cause compile-time problems, so there is a profitability heuristic:
+// loading the value must not require more vector registers than are
+// theoretically available on the given target.
+bool MemCpyOptPass::processMemCpyThroughIntermediate(MemCpyInst *M,
+                                                     MemCpyInst *MDep,
+                                                     BasicBlock::iterator &BBI) {
+  IRBuilder<> Builder(M->getContext());
+
+  const uint64_t NumBytes = cast<ConstantInt>(M->getLength())->getZExtValue();
+
+  Type *ByteVecTy = FixedVectorType::get(
+      IntegerType::getInt8Ty(Builder.getContext()), NumBytes);
+
+  unsigned NumRegs = TTI->getNumberOfRegisters(
+      TTI->getRegisterClassForType(/*Vector=*/true, ByteVecTy));
+  if (!NumRegs)
+    return false;
+
+  unsigned RegBitWidth =
+      TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+          .getFixedSize();
+  const unsigned NeededRegs = divideCeil(8 * NumBytes, RegBitWidth);
+  if (NeededRegs > NumRegs)
+    return false;
+
+  Value *SrcAddr = MDep->getSource();
+  Value *SpillAddr = MDep->getDest();
+  assert(M->getSource() == SpillAddr && "Unexpected memory flow.");
+  Value *TgtAddr = M->getDest();
+
+  Builder.SetInsertPoint(MDep);
+  Instruction *ReloadedVal =
+      Builder.CreateAlignedLoad(ByteVecTy, SrcAddr, MDep->getSourceAlign(),
+                                SrcAddr->getName() + ".reload");
+  ReloadedVal->setAAMetadata(MDep->getAAMetadata());
+  Instruction *Spill =
+      Builder.CreateAlignedStore(ReloadedVal, SpillAddr, MDep->getDestAlign());
+  Spill->setAAMetadata(MDep->getAAMetadata());
+
+  Builder.SetInsertPoint(M);
+  Instruction *Store =
+      Builder.CreateAlignedStore(ReloadedVal, TgtAddr, M->getDestAlign());
+  Store->setAAMetadata(M->getAAMetadata());
+
+  BBI = Store->getIterator();
+
+  eraseInstruction(M);
+  eraseInstruction(MDep);
+
+  ++NumMemCpyThroughIntermediate;
+  return true;
+}
+
 /// We've found that the (upward scanning) memory dependence of memcpy 'M' is
 /// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
 bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
-                                                  MemCpyInst *MDep) {
+                                                  MemCpyInst *MDep,
+                                                  BasicBlock::iterator &BBI) {
   // We can only transforms memcpy's where the dest of one is the source of the
   // other.
   if (M->getSource() != MDep->getDest() || MDep->isVolatile())
     return false;
@@ -1174,6 +1256,7 @@
   //    *b = 42;
   //    memcpy(c <- a)
   // It would be invalid to transform the second memcpy into memcpy(c <- b),
+  // but it could be part of a swap pattern; see processMemCpyThroughIntermediate.
   //
   // TODO: If the code between M and MDep is transparent to the destination "c",
   // then we could still perform the xform by moving M up to the first memcpy.
@@ -1181,7 +1264,7 @@
   // size of M, rather than MDep.
   if (writtenBetween(MSSA, *AA, MemoryLocation::getForSource(MDep),
                      MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(M)))
-    return false;
+    return processMemCpyThroughIntermediate(M, MDep, BBI);
 
   // If the dest of the second might alias the source of the first, then the
   // source and dest might overlap. In addition, if the source of the first
@@ -1512,7 +1595,7 @@
       }
     }
     if (auto *MDep = dyn_cast<MemCpyInst>(MI))
-      return processMemCpyMemCpyDependence(M, MDep);
+      return processMemCpyMemCpyDependence(M, MDep, BBI);
     if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
       if (performMemCpyToMemSetOptzn(M, MDep)) {
         LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n");
@@ -1681,13 +1764,14 @@
 }
 
 PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
+  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
   auto *AA = &AM.getResult<AAManager>(F);
   auto *AC = &AM.getResult<AssumptionAnalysis>(F);
   auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
   auto *MSSA = &AM.getResult<MemorySSAAnalysis>(F);
 
-  bool MadeChange = runImpl(F, &TLI, AA, AC, DT, &MSSA->getMSSA());
+  bool MadeChange = runImpl(F, &TTI, &TLI, AA, AC, DT, &MSSA->getMSSA());
   if (!MadeChange)
     return PreservedAnalyses::all();
@@ -1697,10 +1781,12 @@
   return PA;
 }
 
-bool MemCpyOptPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
-                            AliasAnalysis *AA_, AssumptionCache *AC_,
-                            DominatorTree *DT_, MemorySSA *MSSA_) {
+bool MemCpyOptPass::runImpl(Function &F, TargetTransformInfo *TTI_,
+                            TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
+                            AssumptionCache *AC_, DominatorTree *DT_,
+                            MemorySSA *MSSA_) {
   bool MadeChange = false;
+  TTI = TTI_;
   TLI = TLI_;
   AA = AA_;
   AC = AC_;
@@ -1726,11 +1812,12 @@
   if (skipFunction(F))
     return false;
 
+  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
   auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   auto *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
 
-  return Impl.runImpl(F, TLI, AA, AC, DT, MSSA);
+  return Impl.runImpl(F, TTI, TLI, AA, AC, DT, MSSA);
 }
diff --git a/llvm/test/Transforms/MemCpyOpt/X86/lit.local.cfg b/llvm/test/Transforms/MemCpyOpt/X86/lit.local.cfg
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/MemCpyOpt/X86/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'X86' in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/MemCpyOpt/X86/memory-swap.ll b/llvm/test/Transforms/MemCpyOpt/X86/memory-swap.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/MemCpyOpt/X86/memory-swap.ll
@@ -0,0 +1,311 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -memcpyopt -verify-memoryssa -mtriple=x86_64-- -mattr=-sse,-avx,-avx512f | FileCheck %s --check-prefixes=CHECK-ALL,CHECK-SCALAR
+; RUN: opt < %s -S -memcpyopt -verify-memoryssa -mtriple=x86_64-- -mattr=+sse,-avx,-avx512f | FileCheck %s --check-prefixes=CHECK-ALL,CHECK-SSE,CHECK-SSE1
+; RUN: opt < %s -S -memcpyopt -verify-memoryssa -mtriple=x86_64-- -mattr=+sse,+avx,-avx512f | FileCheck %s --check-prefixes=CHECK-ALL,CHECK-SSE,CHECK-AVX,CHECK-AVX1
+; RUN: opt < %s -S -memcpyopt -verify-memoryssa -mtriple=x86_64-- -mattr=+sse,+avx,+avx512f | FileCheck %s --check-prefixes=CHECK-ALL,CHECK-SSE,CHECK-AVX,CHECK-AVX512F
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @swap-16bytes(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B) {
+; CHECK-SCALAR-LABEL: @swap-16bytes(
+; CHECK-SCALAR-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 1
+; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[INTERMEDIATE]], ptr align 1 [[SOURCE_A:%.*]], i64 16, i1 false)
+; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 16, i1 false)
+; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_B]], ptr align 1 [[INTERMEDIATE]], i64 16, i1
false) +; CHECK-SCALAR-NEXT: ret void +; +; CHECK-SSE-LABEL: @swap-16bytes( +; CHECK-SSE-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 1 +; CHECK-SSE-NEXT: [[SOURCE_A_RELOAD:%.*]] = load <16 x i8>, ptr [[SOURCE_A:%.*]], align 1 +; CHECK-SSE-NEXT: store <16 x i8> [[SOURCE_A_RELOAD]], ptr [[INTERMEDIATE]], align 1 +; CHECK-SSE-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 16, i1 false) +; CHECK-SSE-NEXT: store <16 x i8> [[SOURCE_A_RELOAD]], ptr [[SOURCE_B]], align 1 +; CHECK-SSE-NEXT: ret void +; + %INTERMEDIATE = alloca [16 x i8] + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %INTERMEDIATE, ptr align 1 %SOURCE_A, i64 16, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B, i64 16, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_B, ptr align 1 %INTERMEDIATE, i64 16, i1 false) + ret void +} + +define void @swap-32bytes(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B) { +; CHECK-SCALAR-LABEL: @swap-32bytes( +; CHECK-SCALAR-NEXT: [[INTERMEDIATE:%.*]] = alloca [32 x i8], align 1 +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[INTERMEDIATE]], ptr align 1 [[SOURCE_A:%.*]], i64 32, i1 false) +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 32, i1 false) +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_B]], ptr align 1 [[INTERMEDIATE]], i64 32, i1 false) +; CHECK-SCALAR-NEXT: ret void +; +; CHECK-SSE-LABEL: @swap-32bytes( +; CHECK-SSE-NEXT: [[INTERMEDIATE:%.*]] = alloca [32 x i8], align 1 +; CHECK-SSE-NEXT: [[SOURCE_A_RELOAD:%.*]] = load <32 x i8>, ptr [[SOURCE_A:%.*]], align 1 +; CHECK-SSE-NEXT: store <32 x i8> [[SOURCE_A_RELOAD]], ptr [[INTERMEDIATE]], align 1 +; CHECK-SSE-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 32, i1 false) +; CHECK-SSE-NEXT: store <32 x i8> [[SOURCE_A_RELOAD]], ptr [[SOURCE_B]], align 1 +; CHECK-SSE-NEXT: ret void +; + %INTERMEDIATE = alloca [32 x i8] + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %INTERMEDIATE, ptr align 1 %SOURCE_A, i64 32, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B, i64 32, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_B, ptr align 1 %INTERMEDIATE, i64 32, i1 false) + ret void +} + +define void @swap-64bytes(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B) { +; CHECK-SCALAR-LABEL: @swap-64bytes( +; CHECK-SCALAR-NEXT: [[INTERMEDIATE:%.*]] = alloca [64 x i8], align 1 +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[INTERMEDIATE]], ptr align 1 [[SOURCE_A:%.*]], i64 64, i1 false) +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 64, i1 false) +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_B]], ptr align 1 [[INTERMEDIATE]], i64 64, i1 false) +; CHECK-SCALAR-NEXT: ret void +; +; CHECK-SSE-LABEL: @swap-64bytes( +; CHECK-SSE-NEXT: [[INTERMEDIATE:%.*]] = alloca [64 x i8], align 1 +; CHECK-SSE-NEXT: [[SOURCE_A_RELOAD:%.*]] = load <64 x i8>, ptr [[SOURCE_A:%.*]], align 1 +; CHECK-SSE-NEXT: store <64 x i8> [[SOURCE_A_RELOAD]], ptr [[INTERMEDIATE]], align 1 +; CHECK-SSE-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 64, i1 false) +; CHECK-SSE-NEXT: store <64 x i8> [[SOURCE_A_RELOAD]], ptr [[SOURCE_B]], align 1 +; CHECK-SSE-NEXT: ret void +; + %INTERMEDIATE = alloca 
[64 x i8] + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %INTERMEDIATE, ptr align 1 %SOURCE_A, i64 64, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B, i64 64, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_B, ptr align 1 %INTERMEDIATE, i64 64, i1 false) + ret void +} + +define void @swap-128bytes(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B) { +; CHECK-SCALAR-LABEL: @swap-128bytes( +; CHECK-SCALAR-NEXT: [[INTERMEDIATE:%.*]] = alloca [128 x i8], align 1 +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[INTERMEDIATE]], ptr align 1 [[SOURCE_A:%.*]], i64 128, i1 false) +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 128, i1 false) +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_B]], ptr align 1 [[INTERMEDIATE]], i64 128, i1 false) +; CHECK-SCALAR-NEXT: ret void +; +; CHECK-SSE-LABEL: @swap-128bytes( +; CHECK-SSE-NEXT: [[INTERMEDIATE:%.*]] = alloca [128 x i8], align 1 +; CHECK-SSE-NEXT: [[SOURCE_A_RELOAD:%.*]] = load <128 x i8>, ptr [[SOURCE_A:%.*]], align 1 +; CHECK-SSE-NEXT: store <128 x i8> [[SOURCE_A_RELOAD]], ptr [[INTERMEDIATE]], align 1 +; CHECK-SSE-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 128, i1 false) +; CHECK-SSE-NEXT: store <128 x i8> [[SOURCE_A_RELOAD]], ptr [[SOURCE_B]], align 1 +; CHECK-SSE-NEXT: ret void +; + %INTERMEDIATE = alloca [128 x i8] + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %INTERMEDIATE, ptr align 1 %SOURCE_A, i64 128, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B, i64 128, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_B, ptr align 1 %INTERMEDIATE, i64 128, i1 false) + ret void +} + +define void @swap-256bytes(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B) { +; CHECK-SCALAR-LABEL: @swap-256bytes( +; CHECK-SCALAR-NEXT: [[INTERMEDIATE:%.*]] = alloca [256 x i8], align 1 +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[INTERMEDIATE]], ptr align 1 [[SOURCE_A:%.*]], i64 256, i1 false) +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 256, i1 false) +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_B]], ptr align 1 [[INTERMEDIATE]], i64 256, i1 false) +; CHECK-SCALAR-NEXT: ret void +; +; CHECK-SSE-LABEL: @swap-256bytes( +; CHECK-SSE-NEXT: [[INTERMEDIATE:%.*]] = alloca [256 x i8], align 1 +; CHECK-SSE-NEXT: [[SOURCE_A_RELOAD:%.*]] = load <256 x i8>, ptr [[SOURCE_A:%.*]], align 1 +; CHECK-SSE-NEXT: store <256 x i8> [[SOURCE_A_RELOAD]], ptr [[INTERMEDIATE]], align 1 +; CHECK-SSE-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 256, i1 false) +; CHECK-SSE-NEXT: store <256 x i8> [[SOURCE_A_RELOAD]], ptr [[SOURCE_B]], align 1 +; CHECK-SSE-NEXT: ret void +; + %INTERMEDIATE = alloca [256 x i8] + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %INTERMEDIATE, ptr align 1 %SOURCE_A, i64 256, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B, i64 256, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_B, ptr align 1 %INTERMEDIATE, i64 256, i1 false) + ret void +} + +define void @swap-512bytes(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B) { +; CHECK-SCALAR-LABEL: @swap-512bytes( +; CHECK-SCALAR-NEXT: [[INTERMEDIATE:%.*]] = alloca [512 x i8], align 1 +; CHECK-SCALAR-NEXT: call void 
@llvm.memcpy.p0.p0.i64(ptr align 1 [[INTERMEDIATE]], ptr align 1 [[SOURCE_A:%.*]], i64 512, i1 false) +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 512, i1 false) +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_B]], ptr align 1 [[INTERMEDIATE]], i64 512, i1 false) +; CHECK-SCALAR-NEXT: ret void +; +; CHECK-SSE1-LABEL: @swap-512bytes( +; CHECK-SSE1-NEXT: [[INTERMEDIATE:%.*]] = alloca [512 x i8], align 1 +; CHECK-SSE1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[INTERMEDIATE]], ptr align 1 [[SOURCE_A:%.*]], i64 512, i1 false) +; CHECK-SSE1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 512, i1 false) +; CHECK-SSE1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_B]], ptr align 1 [[INTERMEDIATE]], i64 512, i1 false) +; CHECK-SSE1-NEXT: ret void +; +; CHECK-AVX-LABEL: @swap-512bytes( +; CHECK-AVX-NEXT: [[INTERMEDIATE:%.*]] = alloca [512 x i8], align 1 +; CHECK-AVX-NEXT: [[SOURCE_A_RELOAD:%.*]] = load <512 x i8>, ptr [[SOURCE_A:%.*]], align 1 +; CHECK-AVX-NEXT: store <512 x i8> [[SOURCE_A_RELOAD]], ptr [[INTERMEDIATE]], align 1 +; CHECK-AVX-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 512, i1 false) +; CHECK-AVX-NEXT: store <512 x i8> [[SOURCE_A_RELOAD]], ptr [[SOURCE_B]], align 1 +; CHECK-AVX-NEXT: ret void +; + %INTERMEDIATE = alloca [512 x i8] + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %INTERMEDIATE, ptr align 1 %SOURCE_A, i64 512, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B, i64 512, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_B, ptr align 1 %INTERMEDIATE, i64 512, i1 false) + ret void +} + +define void @swap-1024bytes(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B) { +; CHECK-SCALAR-LABEL: @swap-1024bytes( +; CHECK-SCALAR-NEXT: [[INTERMEDIATE:%.*]] = alloca [1024 x i8], align 1 +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[INTERMEDIATE]], ptr align 1 [[SOURCE_A:%.*]], i64 1024, i1 false) +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 1024, i1 false) +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_B]], ptr align 1 [[INTERMEDIATE]], i64 1024, i1 false) +; CHECK-SCALAR-NEXT: ret void +; +; CHECK-SSE1-LABEL: @swap-1024bytes( +; CHECK-SSE1-NEXT: [[INTERMEDIATE:%.*]] = alloca [1024 x i8], align 1 +; CHECK-SSE1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[INTERMEDIATE]], ptr align 1 [[SOURCE_A:%.*]], i64 1024, i1 false) +; CHECK-SSE1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 1024, i1 false) +; CHECK-SSE1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_B]], ptr align 1 [[INTERMEDIATE]], i64 1024, i1 false) +; CHECK-SSE1-NEXT: ret void +; +; CHECK-AVX1-LABEL: @swap-1024bytes( +; CHECK-AVX1-NEXT: [[INTERMEDIATE:%.*]] = alloca [1024 x i8], align 1 +; CHECK-AVX1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[INTERMEDIATE]], ptr align 1 [[SOURCE_A:%.*]], i64 1024, i1 false) +; CHECK-AVX1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 1024, i1 false) +; CHECK-AVX1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_B]], ptr align 1 [[INTERMEDIATE]], i64 1024, i1 false) +; CHECK-AVX1-NEXT: ret void +; +; CHECK-AVX512F-LABEL: @swap-1024bytes( +; 
CHECK-AVX512F-NEXT: [[INTERMEDIATE:%.*]] = alloca [1024 x i8], align 1 +; CHECK-AVX512F-NEXT: [[SOURCE_A_RELOAD:%.*]] = load <1024 x i8>, ptr [[SOURCE_A:%.*]], align 1 +; CHECK-AVX512F-NEXT: store <1024 x i8> [[SOURCE_A_RELOAD]], ptr [[INTERMEDIATE]], align 1 +; CHECK-AVX512F-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 1024, i1 false) +; CHECK-AVX512F-NEXT: store <1024 x i8> [[SOURCE_A_RELOAD]], ptr [[SOURCE_B]], align 1 +; CHECK-AVX512F-NEXT: ret void +; + %INTERMEDIATE = alloca [1024 x i8] + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %INTERMEDIATE, ptr align 1 %SOURCE_A, i64 1024, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B, i64 1024, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_B, ptr align 1 %INTERMEDIATE, i64 1024, i1 false) + ret void +} + +define void @swap-2048bytes(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B) { +; CHECK-SCALAR-LABEL: @swap-2048bytes( +; CHECK-SCALAR-NEXT: [[INTERMEDIATE:%.*]] = alloca [2048 x i8], align 1 +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[INTERMEDIATE]], ptr align 1 [[SOURCE_A:%.*]], i64 2048, i1 false) +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 2048, i1 false) +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_B]], ptr align 1 [[INTERMEDIATE]], i64 2048, i1 false) +; CHECK-SCALAR-NEXT: ret void +; +; CHECK-SSE1-LABEL: @swap-2048bytes( +; CHECK-SSE1-NEXT: [[INTERMEDIATE:%.*]] = alloca [2048 x i8], align 1 +; CHECK-SSE1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[INTERMEDIATE]], ptr align 1 [[SOURCE_A:%.*]], i64 2048, i1 false) +; CHECK-SSE1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 2048, i1 false) +; CHECK-SSE1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_B]], ptr align 1 [[INTERMEDIATE]], i64 2048, i1 false) +; CHECK-SSE1-NEXT: ret void +; +; CHECK-AVX1-LABEL: @swap-2048bytes( +; CHECK-AVX1-NEXT: [[INTERMEDIATE:%.*]] = alloca [2048 x i8], align 1 +; CHECK-AVX1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[INTERMEDIATE]], ptr align 1 [[SOURCE_A:%.*]], i64 2048, i1 false) +; CHECK-AVX1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 2048, i1 false) +; CHECK-AVX1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_B]], ptr align 1 [[INTERMEDIATE]], i64 2048, i1 false) +; CHECK-AVX1-NEXT: ret void +; +; CHECK-AVX512F-LABEL: @swap-2048bytes( +; CHECK-AVX512F-NEXT: [[INTERMEDIATE:%.*]] = alloca [2048 x i8], align 1 +; CHECK-AVX512F-NEXT: [[SOURCE_A_RELOAD:%.*]] = load <2048 x i8>, ptr [[SOURCE_A:%.*]], align 1 +; CHECK-AVX512F-NEXT: store <2048 x i8> [[SOURCE_A_RELOAD]], ptr [[INTERMEDIATE]], align 1 +; CHECK-AVX512F-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 2048, i1 false) +; CHECK-AVX512F-NEXT: store <2048 x i8> [[SOURCE_A_RELOAD]], ptr [[SOURCE_B]], align 1 +; CHECK-AVX512F-NEXT: ret void +; + %INTERMEDIATE = alloca [2048 x i8] + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %INTERMEDIATE, ptr align 1 %SOURCE_A, i64 2048, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B, i64 2048, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_B, ptr align 1 %INTERMEDIATE, i64 2048, i1 false) + ret void +} + +define void @length-mismatch-final-is-smaller(ptr align 1 
%SOURCE_A, ptr align 1 %SOURCE_B) { +; CHECK-SCALAR-LABEL: @length-mismatch-final-is-smaller( +; CHECK-SCALAR-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 1 +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[INTERMEDIATE]], ptr align 1 [[SOURCE_A:%.*]], i64 16, i1 false) +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 16, i1 false) +; CHECK-SCALAR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_B]], ptr align 1 [[INTERMEDIATE]], i64 8, i1 false) +; CHECK-SCALAR-NEXT: ret void +; +; CHECK-SSE-LABEL: @length-mismatch-final-is-smaller( +; CHECK-SSE-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 1 +; CHECK-SSE-NEXT: [[SOURCE_A_RELOAD:%.*]] = load <8 x i8>, ptr [[SOURCE_A:%.*]], align 1 +; CHECK-SSE-NEXT: store <8 x i8> [[SOURCE_A_RELOAD]], ptr [[INTERMEDIATE]], align 1 +; CHECK-SSE-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 16, i1 false) +; CHECK-SSE-NEXT: store <8 x i8> [[SOURCE_A_RELOAD]], ptr [[SOURCE_B]], align 1 +; CHECK-SSE-NEXT: ret void +; + %INTERMEDIATE = alloca [16 x i8] + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %INTERMEDIATE, ptr align 1 %SOURCE_A, i64 16, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B, i64 16, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_B, ptr align 1 %INTERMEDIATE, i64 8, i1 false) + ret void +} + +define void @length-mismatch-final-is-larger(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B) { +; CHECK-ALL-LABEL: @length-mismatch-final-is-larger( +; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 1 +; CHECK-ALL-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[INTERMEDIATE]], ptr align 1 [[SOURCE_A:%.*]], i64 8, i1 false) +; CHECK-ALL-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 16, i1 false) +; CHECK-ALL-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_B]], ptr align 1 [[INTERMEDIATE]], i64 16, i1 false) +; CHECK-ALL-NEXT: ret void +; + %INTERMEDIATE = alloca [16 x i8] + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %INTERMEDIATE, ptr align 1 %SOURCE_A, i64 8, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B, i64 16, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_B, ptr align 1 %INTERMEDIATE, i64 16, i1 false) + ret void +} + +define void @first-length-is-variable(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B, i64 %len_a) { +; CHECK-ALL-LABEL: @first-length-is-variable( +; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 1 +; CHECK-ALL-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[INTERMEDIATE]], ptr align 1 [[SOURCE_A:%.*]], i64 [[LEN_A:%.*]], i1 false) +; CHECK-ALL-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 16, i1 false) +; CHECK-ALL-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_B]], ptr align 1 [[INTERMEDIATE]], i64 16, i1 false) +; CHECK-ALL-NEXT: ret void +; + %INTERMEDIATE = alloca [16 x i8] + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %INTERMEDIATE, ptr align 1 %SOURCE_A, i64 %len_a, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B, i64 16, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_B, ptr align 1 %INTERMEDIATE, i64 16, i1 false) + ret void +} + +define void @second-length-is-variable(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B, i64 
%len_b) { +; CHECK-ALL-LABEL: @second-length-is-variable( +; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 1 +; CHECK-ALL-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[INTERMEDIATE]], ptr align 1 [[SOURCE_A:%.*]], i64 16, i1 false) +; CHECK-ALL-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 16, i1 false) +; CHECK-ALL-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_B]], ptr align 1 [[INTERMEDIATE]], i64 [[LEN_B:%.*]], i1 false) +; CHECK-ALL-NEXT: ret void +; + %INTERMEDIATE = alloca [16 x i8] + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %INTERMEDIATE, ptr align 1 %SOURCE_A, i64 16, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B, i64 16, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_B, ptr align 1 %INTERMEDIATE, i64 %len_b, i1 false) + ret void +} + +define void @lengths-are-variable(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B, i64 %len_a, i64 %len_b) { +; CHECK-ALL-LABEL: @lengths-are-variable( +; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 1 +; CHECK-ALL-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[INTERMEDIATE]], ptr align 1 [[SOURCE_A:%.*]], i64 [[LEN_A:%.*]], i1 false) +; CHECK-ALL-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_A]], ptr align 1 [[SOURCE_B:%.*]], i64 16, i1 false) +; CHECK-ALL-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[SOURCE_B]], ptr align 1 [[INTERMEDIATE]], i64 [[LEN_B:%.*]], i1 false) +; CHECK-ALL-NEXT: ret void +; + %INTERMEDIATE = alloca [16 x i8] + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %INTERMEDIATE, ptr align 1 %SOURCE_A, i64 %len_a, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_A, ptr align 1 %SOURCE_B, i64 16, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %SOURCE_B, ptr align 1 %INTERMEDIATE, i64 %len_b, i1 false) + ret void +} + +declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1) diff --git a/llvm/test/Transforms/MemCpyOpt/lifetime.ll b/llvm/test/Transforms/MemCpyOpt/lifetime.ll --- a/llvm/test/Transforms/MemCpyOpt/lifetime.ll +++ b/llvm/test/Transforms/MemCpyOpt/lifetime.ll @@ -32,9 +32,10 @@ ; CHECK-LABEL: @memcpy_memcpy_across_lifetime( ; CHECK-NEXT: [[A:%.*]] = alloca [16 x i8], align 1 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[A]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[A]], ptr [[P1:%.*]], i64 16, i1 false) +; CHECK-NEXT: [[P1_RELOAD:%.*]] = load <16 x i8>, ptr [[P1:%.*]], align 16 +; CHECK-NEXT: store <16 x i8> [[P1_RELOAD]], ptr [[A]], align 16 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[P1]], ptr [[P2:%.*]], i64 16, i1 false) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[P2]], ptr [[A]], i64 16, i1 false) +; CHECK-NEXT: store <16 x i8> [[P1_RELOAD]], ptr [[P2]], align 16 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[A]]) ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[P3:%.*]], ptr [[P2]], i64 16, i1 false) ; CHECK-NEXT: ret void
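
For illustration, the swap pattern targeted here typically originates from source code that copies through a local temporary, roughly like the sketch below. The function name, the buffer size N, and the use of std::memcpy are only assumptions for the example; whether the three copies actually reach MemCpyOpt as @llvm.memcpy intrinsics with a constant length depends on the frontend and on earlier passes.

```
#include <cstring>

// Hypothetical reproducer: byte-wise swap of two N-byte buffers through a
// stack temporary. After lowering, this becomes the
// memcpy(tmp <- a); memcpy(a <- b); memcpy(b <- tmp) sequence described in
// the comment on processMemCpyThroughIntermediate. With N = 64 the reload is
// a <64 x i8> load, which fits in four 128-bit registers and is therefore
// accepted even on a plain SSE target (compare the @swap-64bytes test above).
constexpr unsigned N = 64;

void swap_buffers(unsigned char *a, unsigned char *b) {
  unsigned char tmp[N];
  std::memcpy(tmp, a, N); // MDep: tmp <- a
  std::memcpy(a, b, N);   // a is clobbered in between
  std::memcpy(b, tmp, N); // M: b <- tmp
}
```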