Index: include/llvm/Transforms/Scalar/SROA.h
===================================================================
--- include/llvm/Transforms/Scalar/SROA.h
+++ include/llvm/Transforms/Scalar/SROA.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Support/Compiler.h"
 #include <vector>
@@ -66,6 +67,7 @@
   LLVMContext *C = nullptr;
   DominatorTree *DT = nullptr;
   AssumptionCache *AC = nullptr;
+  TargetTransformInfo *TTI = nullptr;
 
   /// Worklist of alloca instructions to simplify.
   ///
@@ -121,7 +123,7 @@
 
   /// Helper used by both the public run method and by the legacy pass.
   PreservedAnalyses runImpl(Function &F, DominatorTree &RunDT,
-                            AssumptionCache &RunAC);
+                            AssumptionCache &RunAC, TargetTransformInfo &TTI);
 
   bool presplitLoadsAndStores(AllocaInst &AI, sroa::AllocaSlices &AS);
   AllocaInst *rewritePartition(AllocaInst &AI, sroa::AllocaSlices &AS,
Index: lib/Transforms/Scalar/SROA.cpp
===================================================================
--- lib/Transforms/Scalar/SROA.cpp
+++ lib/Transforms/Scalar/SROA.cpp
@@ -297,6 +297,8 @@
   void dump() const;
 #endif
 
+  bool shouldExpand(TargetTransformInfo *TTI) const;
+
 private:
   template <typename DerivedT> class BuilderBase;
   class SliceBuilder;
@@ -1095,6 +1097,26 @@
 
 #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 
+bool AllocaSlices::shouldExpand(TargetTransformInfo *TTI) const {
+  for (auto I = begin(), E = end(); I != E; ++I) {
+    const User *U = I->getUse()->getUser();
+    if (!dyn_cast<Instruction>(U)->getParent()->getParent()->hasMinSize())
+      return true;
+
+    const unsigned Cost = TTI->getInstructionCost(dyn_cast<Instruction>(U),
+        TargetTransformInfo::TCK_CodeSize);
+
+    if (Cost >= TargetTransformInfo::TCC_Expensive) {
+      LLVM_DEBUG(dbgs() << "Instruction with cost " << Cost << " is too " <<
+                 "expensive to expand: "; U->dump();
+                 dbgs() << "Not lowering this slice: ";
+                 print(dbgs(), I));
+      return false;
+    }
+  }
+  return true;
+}
+
 /// Walk
the range of a partitioning looking for a common type to cover this
/// sequence of slices.
static Type *findCommonType(AllocaSlices::const_iterator B,
@@ -4403,6 +4425,10 @@
   // Build the slices using a recursive instruction-visiting builder.
   AllocaSlices AS(DL, AI);
   LLVM_DEBUG(AS.print(dbgs()));
+
+  if (!AS.shouldExpand(TTI))
+    return Changed;
+
   if (AS.isEscaped())
     return Changed;
 
@@ -4501,11 +4527,13 @@
 }
 
 PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
-                                AssumptionCache &RunAC) {
+                                AssumptionCache &RunAC,
+                                TargetTransformInfo &RunTTI) {
   LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
   C = &F.getContext();
   DT = &RunDT;
   AC = &RunAC;
+  TTI = &RunTTI;
 
   BasicBlock &EntryBB = F.getEntryBlock();
   for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
@@ -4553,7 +4581,8 @@
 
 PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) {
   return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
-                 AM.getResult<AssumptionAnalysis>(F));
+                 AM.getResult<AssumptionAnalysis>(F),
+                 AM.getResult<TargetIRAnalysis>(F));
 }
 
 /// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
@@ -4577,11 +4606,13 @@
 
     auto PA = Impl.runImpl(
         F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
-        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
+        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F));
     return !PA.areAllPreserved();
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
Index: test/Transforms/SROA/memcpy-minsize.ll
===================================================================
--- /dev/null
+++ test/Transforms/SROA/memcpy-minsize.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7m-arm-unknown-eabi"
+
+%struct.data_t = type { [32 x i8] }
+%struct.data_2_t = type { i64, i64, i64 }
+
+define dso_local void @test(i8* %src) minsize optsize {
+; CHECK-LABEL: @test
+; CHECK: %tmp = alloca %struct.data_t, align 1
+; CHECK: @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %tmp6, i8* align 1 %arrayidx, i32 8, i1 false)
+; CHECK: @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %tmp8, i8* align 1 %arrayidx2, i32 8, i1 false)
+entry:
+  %src.addr = alloca i8*, align 4
+  %tmp = alloca %struct.data_t, align 1
+  %dst = alloca %struct.data_2_t*, align 4
+  store i8* %src, i8** %src.addr, align 4
+  %tmp2 = bitcast %struct.data_t* %tmp to i8*
+  %tmp3 = load i8*, i8** %src.addr, align 4
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp2, i8* align 1 %tmp3, i32 32, i1 false)
+  %call = call %struct.data_2_t* bitcast (%struct.data_2_t* (...)* @getData2 to %struct.data_2_t* ()*)()
+  store %struct.data_2_t* %call, %struct.data_2_t** %dst, align 4
+  %tmp5 = load %struct.data_2_t*, %struct.data_2_t** %dst, align 4
+  %m1 = getelementptr inbounds %struct.data_2_t, %struct.data_2_t* %tmp5, i32 0, i32 0
+  %tmp6 = bitcast i64* %m1 to i8*
+  %data = getelementptr inbounds %struct.data_t, %struct.data_t* %tmp, i32 0, i32 0
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* %data, i32 0, i32 8
+  call void
@llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %tmp6, i8* align 1 %arrayidx, i32 8, i1 false)
+  %tmp7 = load %struct.data_2_t*, %struct.data_2_t** %dst, align 4
+  %m2 = getelementptr inbounds %struct.data_2_t, %struct.data_2_t* %tmp7, i32 0, i32 1
+  %tmp8 = bitcast i64* %m2 to i8*
+  %data1 = getelementptr inbounds %struct.data_t, %struct.data_t* %tmp, i32 0, i32 0
+  %arrayidx2 = getelementptr inbounds [32 x i8], [32 x i8]* %data1, i32 0, i32 16
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %tmp8, i8* align 1 %arrayidx2, i32 8, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i1 immarg)
+declare dso_local %struct.data_2_t* @getData2(...) local_unnamed_addr