Index: llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h
===================================================================
--- llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h
+++ llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h
@@ -373,8 +373,8 @@
 bool promoteLoopAccessesToScalars(AliasSet &, SmallVectorImpl<BasicBlock *> &,
                                   SmallVectorImpl<Instruction *> &,
                                   PredIteratorCache &, LoopInfo *,
-                                  DominatorTree *, Loop *, AliasSetTracker *,
-                                  LICMSafetyInfo *);
+                                  DominatorTree *, const TargetLibraryInfo *,
+                                  Loop *, AliasSetTracker *, LICMSafetyInfo *);
 
 /// \brief Computes safety information for a loop
 /// checks loop body & header for the possibility of may throw
Index: llvm/trunk/lib/Transforms/Scalar/LICM.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/LICM.cpp
+++ llvm/trunk/lib/Transforms/Scalar/LICM.cpp
@@ -35,10 +35,12 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
@@ -223,7 +225,7 @@
   // Loop over all of the alias sets in the tracker object.
   for (AliasSet &AS : *CurAST)
     Changed |= promoteLoopAccessesToScalars(AS, ExitBlocks, InsertPts,
-                                            PIC, LI, DT, CurLoop,
+                                            PIC, LI, DT, TLI, CurLoop,
                                             CurAST, &SafetyInfo);
 
   // Once we have promoted values across the loop body we have to recursively
@@ -846,7 +848,9 @@
                                         SmallVectorImpl<BasicBlock *> &ExitBlocks,
                                         SmallVectorImpl<Instruction *> &InsertPts,
                                         PredIteratorCache &PIC, LoopInfo *LI,
-                                        DominatorTree *DT, Loop *CurLoop,
+                                        DominatorTree *DT,
+                                        const TargetLibraryInfo *TLI,
+                                        Loop *CurLoop,
                                         AliasSetTracker *CurAST,
                                         LICMSafetyInfo * SafetyInfo) {
   // Verify inputs.
@@ -879,10 +883,25 @@
   //
   // is not safe, because *P may only be valid to access if 'c' is true.
   //
+  // The safety property divides into two parts:
+  // 1) The memory may not be dereferenceable on entry to the loop. In this
+  //    case, we can't insert the required load in the preheader.
+  // 2) The memory model does not allow us to insert a store along any dynamic
+  //    path which did not originally have one.
+  //
   // It is safe to promote P if all uses are direct load/stores and if at
   // least one is guaranteed to be executed.
   bool GuaranteedToExecute = false;
 
+  // It is also safe to promote P if we can prove that speculating a load into
+  // the preheader is safe (i.e. proving dereferenceability on all
+  // paths through the loop), and that the memory can be proven thread local
+  // (so that the memory model requirement doesn't apply). We first establish
+  // the former, and then run a capture analysis below to establish the latter.
+  // We can use any access within the alias set to prove dereferenceability
+  // since they're all must alias.
+  bool CanSpeculateLoad = false;
+
   SmallVector<Instruction *, 64> LoopUses;
   SmallPtrSet<Value *, 4> PointerMustAliases;
 
@@ -892,6 +911,16 @@
   AAMDNodes AATags;
   bool HasDedicatedExits = CurLoop->hasDedicatedExits();
 
+  // Don't sink stores from loops without dedicated block exits. Exits
+  // containing indirect branches are not transformed by loop simplify,
+  // make sure we catch that. An additional load may be generated in the
+  // preheader for SSA updater, so also avoid sinking when no preheader
+  // is available.
+  if (!HasDedicatedExits || !Preheader)
+    return false;
+
+  const DataLayout &MDL = Preheader->getModule()->getDataLayout();
+
   // Check that all of the pointers in the alias set have the same type.  We
   // cannot (yet) promote a memory location that is loaded and stored in
   // different sizes.  While we are at it, collect alignment and AA info.
@@ -918,6 +947,12 @@
       assert(!Load->isVolatile() && "AST broken");
       if (!Load->isSimple())
         return Changed;
+
+      if (!GuaranteedToExecute && !CanSpeculateLoad)
+        CanSpeculateLoad =
+          isSafeToExecuteUnconditionally(*Load, DT, TLI, CurLoop,
+                                         SafetyInfo,
+                                         Preheader->getTerminator());
     } else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) {
       // Stores *of* the pointer are not interesting, only stores *to* the
      // pointer.
@@ -926,13 +961,6 @@
      if (UI->getOperand(1) != ASIV)
        continue;
      assert(!Store->isVolatile() && "AST broken");
      if (!Store->isSimple())
        return Changed;
-      // Don't sink stores from loops without dedicated block exits. Exits
-      // containing indirect branches are not transformed by loop simplify,
-      // make sure we catch that. An additional load may be generated in the
-      // preheader for SSA updater, so also avoid sinking when no preheader
-      // is available.
-      if (!HasDedicatedExits || !Preheader)
-        return Changed;
 
      // Note that we only check GuaranteedToExecute inside the store case
      // so that we do not introduce stores where they did not exist before
@@ -953,6 +981,14 @@
        GuaranteedToExecute = isGuaranteedToExecute(*UI, DT,
                                                    CurLoop, SafetyInfo);
+
+      if (!GuaranteedToExecute && !CanSpeculateLoad) {
+        CanSpeculateLoad =
+          isDereferenceableAndAlignedPointer(Store->getPointerOperand(),
+                                             Store->getAlignment(), MDL,
+                                             Preheader->getTerminator(),
+                                             DT, TLI);
+      }
    } else
      return Changed; // Not a load or store.
 
@@ -968,8 +1004,17 @@
    }
  }
 
-  // If there isn't a guaranteed-to-execute instruction, we can't promote.
-  if (!GuaranteedToExecute)
+  // Check legality per comment above. Otherwise, we can't promote.
+  bool PromotionIsLegal = GuaranteedToExecute;
+  if (!PromotionIsLegal && CanSpeculateLoad) {
+    // If this is a thread local location, then we can insert stores along
+    // paths which originally didn't have them without violating the memory
+    // model.
+    Value *Object = GetUnderlyingObject(SomePtr, MDL);
+    PromotionIsLegal = isAllocLikeFn(Object, TLI) &&
+      !PointerMayBeCaptured(Object, true, true);
+  }
+  if (!PromotionIsLegal)
    return Changed;
 
  // Figure out the loop exits and their insertion points, if this is the
Index: llvm/trunk/test/Transforms/LICM/promote-tls.ll
===================================================================
--- llvm/trunk/test/Transforms/LICM/promote-tls.ll
+++ llvm/trunk/test/Transforms/LICM/promote-tls.ll
@@ -0,0 +1,133 @@
+; RUN: opt -tbaa -basicaa -licm -S < %s | FileCheck %s
+
+; If we can prove a local is thread local, we can insert stores during
+; promotion which wouldn't be legal otherwise.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-linux-generic"
+
+@p = external global i8*
+
+declare i8* @malloc(i64)
+
+; Exercise the TLS case
+define i32* @test(i32 %n) {
+entry:
+  ;; ignore the required null check for simplicity
+  %mem = call dereferenceable(16) noalias i8* @malloc(i64 16)
+  %addr = bitcast i8* %mem to i32*
+  br label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.header
+
+for.header:
+  %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %old = load i32, i32* %addr, align 4
+  ; deliberate impossible to analyze branch
+  %guard = load volatile i8*, i8** @p
+  %exitcmp = icmp eq i8* %guard, null
+  br i1 %exitcmp, label %for.body, label %early-exit
+
+early-exit:
+; CHECK-LABEL: early-exit:
+; CHECK: store i32 %new1.lcssa, i32* %addr, align 1
+  ret i32* null
+
+for.body:
+  %new = add i32 %old, 1
+  store i32 %new, i32* %addr, align 4
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.header, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+; CHECK-LABEL: for.cond.for.end_crit_edge:
+; CHECK: store i32 %new.lcssa, i32* %addr, align 1
+  %split = phi i32* [ %addr, %for.body ]
+  ret i32* null
+}
+
+declare i8* @not_malloc(i64)
+
+; Negative test - not TLS
+define i32* @test_neg(i32 %n) {
+entry:
+  ;; ignore the required null check for simplicity
+  %mem = call dereferenceable(16) noalias i8* @not_malloc(i64 16)
+  %addr = bitcast i8* %mem to i32*
+  br label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.header
+
+for.header:
+  %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %old = load i32, i32* %addr, align 4
+  ; deliberate impossible to analyze branch
+  %guard = load volatile i8*, i8** @p
+  %exitcmp = icmp eq i8* %guard, null
+  br i1 %exitcmp, label %for.body, label %early-exit
+
+early-exit:
+; CHECK-LABEL: early-exit:
+; CHECK-NOT: store
+  ret i32* null
+
+for.body:
+; CHECK-LABEL: for.body:
+; CHECK: store i32 %new, i32* %addr, align 4
+  %new = add i32 %old, 1
+  store i32 %new, i32* %addr, align 4
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.header, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+; CHECK-LABEL: for.cond.for.end_crit_edge:
+; CHECK-NOT: store
+  %split = phi i32* [ %addr, %for.body ]
+  ret i32* null
+}
+
+; Negative test - can't speculate load since branch
+; may control alignment
+define i32* @test_neg2(i32 %n) {
+entry:
+  ;; ignore the required null check for simplicity
+  %mem = call dereferenceable(16) noalias i8* @malloc(i64 16)
+  %addr = bitcast i8* %mem to i32*
+  br label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.header
+
+for.header:
+  %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  ; deliberate impossible to analyze branch
+  %guard = load volatile i8*, i8** @p
+  %exitcmp = icmp eq i8* %guard, null
+  br i1 %exitcmp, label %for.body, label %early-exit
+
+early-exit:
+; CHECK-LABEL: early-exit:
+; CHECK-NOT: store
+  ret i32* null
+
+for.body:
+; CHECK-LABEL: for.body:
+; CHECK: store i32 %new, i32* %addr, align 4
+  %old = load i32, i32* %addr, align 4
+  %new = add i32 %old, 1
+  store i32 %new, i32* %addr, align 4
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.header, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+; CHECK-LABEL: for.cond.for.end_crit_edge:
+; CHECK-NOT: store
+  %split = phi i32* [ %addr, %for.body ]
+  ret i32* null
+}
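
For reference, the hand-written sketch below (not pass output, and not part of the patch) shows roughly what @test from the new test file looks like after -licm promotion with this change: the load of %addr is speculated into the preheader, the in-loop load/store pair disappears, and a store of the live-out value is inserted in each exit block, which is exactly what the CHECK lines verify. The value names (%addr.promoted, %new1) and the alignments are assumptions about what the SSA updater will pick; the test only checks the exit-block stores.

; Illustrative sketch only: @test after promotion (approximate names/alignments).
for.body.lr.ph:
  %addr.promoted = load i32, i32* %addr, align 1  ; load speculated into the preheader:
                                                  ; %addr is dereferenceable(16) and the
                                                  ; malloc'd object is never captured
  br label %for.header

for.header:
  %new1 = phi i32 [ %addr.promoted, %for.body.lr.ph ], [ %new, %for.body ]
  %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %guard = load volatile i8*, i8** @p
  %exitcmp = icmp eq i8* %guard, null
  br i1 %exitcmp, label %for.body, label %early-exit

early-exit:
  %new1.lcssa = phi i32 [ %new1, %for.header ]
  store i32 %new1.lcssa, i32* %addr, align 1      ; store inserted on a path that had no
  ret i32* null                                   ; store before; legal only because the
                                                  ; location is provably thread local

for.body:
  %new = add i32 %new1, 1                         ; original in-loop load/store are gone
  %inc = add nsw i32 %i.02, 1
  %cmp = icmp slt i32 %inc, %n
  br i1 %cmp, label %for.header, label %for.cond.for.end_crit_edge

for.cond.for.end_crit_edge:
  %new.lcssa = phi i32 [ %new, %for.body ]
  store i32 %new.lcssa, i32* %addr, align 1
  ret i32* null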