Index: lib/Transforms/Utils/PromoteMemoryToRegister.cpp =================================================================== --- lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -68,6 +68,33 @@ // not have any meaning for a local alloca. if (SI->isVolatile()) return false; + } else if (const MemCpyInst *MCI = dyn_cast(U)) { + // Punt if this alloca is an array allocation + if (AI->isArrayAllocation()) + return false; + if (MCI->isVolatile()) + return false; + Value *Length = MCI->getLength(); + if (!isa(Length)) + return false; + // Anything less than the full alloca, we leave for SROA + const DataLayout &DL = AI->getModule()->getDataLayout(); + size_t AIElSize = DL.getTypeAllocSize(AI->getAllocatedType()); + if (cast(Length)->getZExtValue() != AIElSize) + return false; + // If the other argument is also an alloca, we need to be sure that either + // the types are bitcastable, or the other alloca is not eligible for + // promotion (e.g. because the memcpy is for less than the whole size of + // that alloca), otherwise we risk turning an allocatable alloca into a + // non-allocatable one when splitting the memcpy. + AllocaInst *OtherAI = dyn_cast( + AI == MCI->getRawSource() ? MCI->getRawDest() : MCI->getRawSource()); + if (OtherAI) { + if (!CastInst::isBitCastable(AI->getAllocatedType(), + OtherAI->getAllocatedType()) && + DL.getTypeAllocSize(OtherAI->getAllocatedType()) == AIElSize) + return false; + } } else if (const IntrinsicInst *II = dyn_cast(U)) { if (II->getIntrinsicID() != Intrinsic::lifetime_start && II->getIntrinsicID() != Intrinsic::lifetime_end) @@ -98,11 +125,14 @@ SmallVector DefiningBlocks; SmallVector UsingBlocks; + // This gets updated with stores we find as we get along. Our use of + // a vector for DefiningBlocks has the side effect of counting the number + // of stores, so if DefiningBlocks.size() == 1, there is only one store + // and we can quickly find it here. 
StoreInst *OnlyStore; BasicBlock *OnlyBlock; bool OnlyUsedInOneBlock; - Value *AllocaPointerVal; DbgDeclareInst *DbgDeclare; void clear() { @@ -111,7 +141,6 @@ OnlyStore = nullptr; OnlyBlock = nullptr; OnlyUsedInOneBlock = true; - AllocaPointerVal = nullptr; DbgDeclare = nullptr; } @@ -129,14 +158,11 @@ if (StoreInst *SI = dyn_cast(User)) { // Remember the basic blocks which define new values for the alloca DefiningBlocks.push_back(SI->getParent()); - AllocaPointerVal = SI->getOperand(0); OnlyStore = SI; } else { LoadInst *LI = cast(User); - // Otherwise it must be a load instruction, keep track of variable - // reads. + // Keep track of variable reads. UsingBlocks.push_back(LI->getParent()); - AllocaPointerVal = LI; } if (OnlyUsedInOneBlock) { @@ -181,7 +207,9 @@ /// This code only looks at accesses to allocas. static bool isInterestingInstruction(const Instruction *I) { return (isa(I) && isa(I->getOperand(0))) || - (isa(I) && isa(I->getOperand(1))); + (isa(I) && isa(I->getOperand(1))) || + (isa(I) && (isa(I->getOperand(0)) || + isa(I->getOperand(1)))); } /// Get or calculate the index of the specified instruction. @@ -208,6 +236,24 @@ return It->second; } + // When we split a memcpy intrinsic, we need to update the numbering in this + // struct. To make sure the relative ordering remains the same, we give both + // the LI and the SI the number that the MCI used to have (if they are both + // interesting). This means that they will have equal numbers, which usually + // can't happen. However, since they can never reference the same alloca + // (since memcpy operands may not overlap), this is fine, because we will + // never compare instruction indices for instructions that operate on distinct + // allocas. 
+  void splitMemCpy(MemCpyInst *MCI, LoadInst *LI, StoreInst *SI) {
+    auto It = InstNumbers.find(MCI);
+    if (It == InstNumbers.end())
+      return;
+    const unsigned Num = It->second; // Inserting below may invalidate It.
+    InstNumbers[LI] = Num;
+    InstNumbers[SI] = Num;
+    deleteValue(MCI);
+  }
+
   void deleteValue(const Instruction *I) { InstNumbers.erase(I); }
 
   void clear() { InstNumbers.clear(); }
@@ -305,9 +351,10 @@
   AC->registerAssumption(CI);
 }
 
-static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
-  // Knowing that this alloca is promotable, we know that it's safe to kill all
-  // instructions except for load and store.
+static void canonicalizeUsers(LargeBlockInfo &LBI, AllocaInst *AI) {
+  // Knowing that this alloca is promotable, we know that it's safe to split
+  // MTIs into load/store and to kill all other instructions except for
+  // load and store.
 
   for (auto UI = AI->user_begin(), UE = AI->user_end(); UI != UE;) {
     Instruction *I = cast<Instruction>(*UI);
@@ -315,6 +362,38 @@
     if (isa<LoadInst>(I) || isa<StoreInst>(I))
       continue;
 
+    if (isa<MemCpyInst>(I)) {
+      MemCpyInst *MCI = cast<MemCpyInst>(I);
+      AAMDNodes AA;
+      MCI->getAAMetadata(AA);
+      // This might add to the end of the use list, but that's fine. At worst,
+      // we'd not visit the instructions we insert here, but we don't care
+      // about them in this loop anyway.
+ LoadInst *LI = + new LoadInst(AI->getAllocatedType(), MCI->getRawSource(), "", + MCI->isVolatile(), MCI->getAlignment(), MCI); + Value *Val = LI; + Value *Dest = MCI->getRawDest(); + Type *DestElTy = cast(Dest->getType())->getElementType(); + if (LI->getType() != DestElTy) { + if (CastInst::isBitCastable(LI->getType(), DestElTy)) + Val = CastInst::Create(Instruction::BitCast, Val, DestElTy, "", MCI); + else + Dest = CastInst::Create( + Instruction::BitCast, Dest, + LI->getType()->getPointerTo( + cast(Dest->getType())->getAddressSpace()), + "", MCI); + } + StoreInst *SI = + new StoreInst(Val, Dest, MCI->isVolatile(), MCI->getAlignment(), MCI); + LI->setAAMetadata(AA); + SI->setAAMetadata(AA); + LBI.splitMemCpy(MCI, LI, SI); + MCI->eraseFromParent(); + continue; + } + if (!I->getType()->isVoidTy()) { // The only users of this bitcast/GEP instruction are lifetime intrinsics. // Follow the use/def chain to erase them now instead of leaving it for @@ -542,7 +621,7 @@ assert(AI->getParent()->getParent() == &F && "All allocas should be in the same function, which is same as DF!"); - removeLifetimeIntrinsicUsers(AI); + canonicalizeUsers(LBI, AI); if (AI->use_empty()) { // If there are no uses of the alloca, just delete it now. 
Index: test/Transforms/Mem2Reg/memcpy.ll =================================================================== --- /dev/null +++ test/Transforms/Mem2Reg/memcpy.ll @@ -0,0 +1,44 @@ +; RUN: opt < %s -mem2reg -S | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +declare void @llvm.memcpy.p0i128.p0i64.i32(i128 *, i64 *, i32, i32, i1) +declare void @llvm.memcpy.p0i64.p0i64.i32(i64 *, i64 *, i32, i32, i1) +declare void @llvm.memcpy.p0f64.p0i64.i32(double *, i64 *, i32, i32, i1) + +define i128 @test_cpy_different(i64) { +; CHECK-LABEL: @test_cpy_different +; CHECK-NOT: alloca i64 +; CHECK: store i64 %0 + %a = alloca i64 + %b = alloca i128 + store i128 0, i128 *%b + store i64 %0, i64 *%a + call void @llvm.memcpy.p0i128.p0i64.i32(i128 *%b, i64 *%a, i32 8, i32 0, i1 0) + %loaded = load i128, i128 *%b + ret i128 %loaded +} + +define i64 @test_cpy_same(i64) { +; CHECK-LABEL: @test_cpy_same +; CHECK-NOT: alloca +; CHECK: ret i64 %0 + %a = alloca i64 + %b = alloca i64 + store i64 %0, i64 *%a + call void @llvm.memcpy.p0i64.p0i64.i32(i64 *%b, i64 *%a, i32 8, i32 0, i1 0) + %loaded = load i64, i64 *%b + ret i64 %loaded +} + +define double @test_cpy_different_type(i64) { +; CHECK-LABEL: @test_cpy_different_type +; CHECK-NOT: alloca +; CHECK: bitcast i64 %0 to double + %a = alloca i64 + %b = alloca double + store i64 %0, i64 *%a + call void @llvm.memcpy.p0f64.p0i64.i32(double *%b, i64 *%a, i32 8, i32 0, i1 0) + %loaded = load double, double *%b + ret double %loaded +}