diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -1282,6 +1282,17 @@ } }; +/// This represents the llvm.type.test intrinsic. +class TypeTestInst : public IntrinsicInst { +public: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::type_test; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + } // end namespace llvm #endif // LLVM_IR_INTRINSICINST_H diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -416,17 +416,28 @@ return isSafeToLoadUnconditionally(V, Alignment, Size, DL, ScanFrom, DT, TLI); } -/// DefMaxInstsToScan - the default number of maximum instructions +/// DefMaxInstsToScan - the default number of maximum ld/st instructions /// to scan in the block, used by FindAvailableLoadedValue(). /// FindAvailableLoadedValue() was introduced in r60148, to improve jump /// threading in part by eliminating partially redundant loads. /// At that point, the value of MaxInstsToScan was already set to '6' /// without documented explanation. -cl::opt<unsigned> -llvm::DefMaxInstsToScan("available-load-scan-limit", cl::init(6), cl::Hidden, -cl::desc("Use this to specify the default maximum number of instructions " -"to scan backward from a given instruction, when searching for " -"available loaded value")); +cl::opt<unsigned> llvm::DefMaxInstsToScan( + "available-load-scan-limit", cl::init(6), cl::Hidden, + cl::desc( + "Use this to specify the default maximum number of ld/st instructions " + "to scan backward from a given instruction, when searching for " + "available loaded value")); + +// The default maximum number of non ld/st instructions to skip when finding +// available loads. 
+static cl::opt<unsigned>
 +    DefMaxInstsToSkip("available-load-skip-limit", cl::init(32), cl::Hidden,
 +                      cl::desc("Use this to specify the default maximum number "
 +                               "of non-ld/st instructions "
 +                               "to skip when scanning backward from a given "
 +                               "instruction, when searching for "
 +                               "available loaded value")); Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB, @@ -524,6 +535,11 @@ AAResults *AA, bool *IsLoadCSE, unsigned *NumScanedInst) { if (MaxInstsToScan == 0) MaxInstsToScan = ~0U; + // If the max insts to scan is larger, presumably the caller wanted to give a + // larger budget (potentially unlimited), so bump up the insts to skip + // accordingly. + int MaxNonLdSt = + DefMaxInstsToSkip > MaxInstsToScan ? DefMaxInstsToSkip : MaxInstsToScan; const DataLayout &DL = ScanBB->getModule()->getDataLayout(); const Value *StrippedPtr = Loc.Ptr->stripPointerCasts(); @@ -532,8 +548,13 @@ // We must ignore debug info directives when counting (otherwise they // would affect codegen). Instruction *Inst = &*--ScanFrom; - if (Inst->isDebugOrPseudoInst()) + if (!isa<LoadInst>(Inst) && !isa<StoreInst>(Inst) && + !Inst->mayWriteToMemory()) { + if (MaxNonLdSt-- == 0) { + return nullptr; + } continue; + } // Restore ScanFrom to expected value in case next test succeeds ScanFrom++; @@ -618,10 +639,20 @@ // queries until later. Value *Available = nullptr; SmallVector<Instruction *> MustNotAliasInsts; + // If the max insts to scan is larger, presumably the caller wanted to give a + // larger budget (potentially unlimited), so bump up the insts to skip + // accordingly. + int MaxNonLdSt = + DefMaxInstsToSkip > MaxInstsToScan ? 
DefMaxInstsToSkip : MaxInstsToScan; for (Instruction &Inst : make_range(++Load->getReverseIterator(), ScanBB->rend())) { - if (Inst.isDebugOrPseudoInst()) + if (!isa<LoadInst>(&Inst) && !isa<StoreInst>(&Inst) && + !Inst.mayWriteToMemory()) { + if (MaxNonLdSt-- == 0) { + return nullptr; + } continue; + } if (MaxInstsToScan-- == 0) return nullptr; diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1404,8 +1404,11 @@ // If PredBB has a single predecessor, continue scanning through the // single predecessor. BasicBlock *SinglePredBB = PredBB; + // Make sure we don't walk backwards into an unreachable loop. + SmallPtrSet<BasicBlock *, 8> Visited; while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() && - NumScanedInst < DefMaxInstsToScan) { + NumScanedInst < DefMaxInstsToScan && + Visited.insert(SinglePredBB).second) { SinglePredBB = SinglePredBB->getSinglePredecessor(); if (SinglePredBB) { BBIt = SinglePredBB->end(); diff --git a/llvm/test/Transforms/InstCombine/load.ll b/llvm/test/Transforms/InstCombine/load.ll --- a/llvm/test/Transforms/InstCombine/load.ll +++ b/llvm/test/Transforms/InstCombine/load.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -instcombine -S < %s | FileCheck %s -; RUN: opt -passes=instcombine -S < %s | FileCheck %s +; Set available-load-scan-limit to the current default of 6 so that the test +; works the same if the limit is adjusted in the future. 
+; RUN: opt -instcombine -available-load-scan-limit=6 -S < %s | FileCheck %s +; RUN: opt -passes=instcombine -available-load-scan-limit=6 -S < %s | FileCheck %s target datalayout = "e-m:e-p:64:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1" @@ -100,6 +102,37 @@ ret i32 %X } +declare void @llvm.assume(i1) +declare i1 @llvm.type.test(i8*, metadata) nounwind readnone + +; Ensure that load store forwarding not prevented by the bitcast / type test / +; assume sequences inserted for whole program devirtualization. +define i32 @test8_type_test_assume(i32* %P, [3 x i8*]* %vtable) { +; CHECK-LABEL: @test8_type_test_assume( +; CHECK-NEXT: store i32 1, i32* [[P:%.*]], align 4 +; CHECK-NEXT: bitcast [3 x i8*]* %vtable to i8* +; CHECK-NEXT: call i1 @llvm.type.test +; CHECK-NEXT: call void @llvm.assume +; CHECK-NEXT: bitcast [3 x i8*]* %vtable to i8* +; CHECK-NEXT: call i1 @llvm.type.test +; CHECK-NEXT: call void @llvm.assume +; CHECK-NEXT: ret i32 1 +; + store i32 1, i32* %P + + ; Insert 2 bitcast / type test / assume sequences so that we would be above + ; the scan limit of 6 if they were not ignored. 
+ %vtablei8 = bitcast [3 x i8*]* %vtable to i8* + %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"foo") + tail call void @llvm.assume(i1 %p) + %vtablei82 = bitcast [3 x i8*]* %vtable to i8* + %p2 = call i1 @llvm.type.test(i8* %vtablei82, metadata !"foo") + tail call void @llvm.assume(i1 %p2) + + %X = load i32, i32* %P ; [#uses=1] + ret i32 %X +} + define i32 @test9(i32* %P) { ; CHECK-LABEL: @test9( ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/JumpThreading/thread-loads.ll b/llvm/test/Transforms/JumpThreading/thread-loads.ll --- a/llvm/test/Transforms/JumpThreading/thread-loads.ll +++ b/llvm/test/Transforms/JumpThreading/thread-loads.ll @@ -1,6 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -jump-threading -S | FileCheck %s -; RUN: opt < %s -aa-pipeline=basic-aa -passes=jump-threading -S | FileCheck %s +; Set available-load-scan-limit to the current default of 6 so that the test +; works the same if the limit is adjusted in the future. +; RUN: opt < %s -jump-threading -available-load-scan-limit=6 -S | FileCheck %s +; RUN: opt < %s -aa-pipeline=basic-aa -passes=jump-threading -available-load-scan-limit=6 -S | FileCheck %s +; Try again with a low skip limit. +; RUN: opt < %s -aa-pipeline=basic-aa -passes=jump-threading -available-load-scan-limit=6 -available-load-skip-limit=6 -S | FileCheck %s --check-prefix=SKIPLIM6 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i386-apple-darwin7" @@ -682,6 +686,127 @@ ret i32 10 } +declare void @llvm.assume(i1) +declare i1 @llvm.type.test(i8*, metadata) nounwind readnone + +; Test that we can thread through the block with the partially redundant load (%2), +; ensuring that the threading is not prevented by the bitcast / type test / +; assume sequences inserted for whole program devirtualization. 
+define i32 @test1_type_test_assume(i32* %P, [3 x i8*]* %vtable) nounwind { +; CHECK-LABEL: @test1_type_test_assume( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 (...) @f1() #[[ATTR0:[0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[BB1:%.*]], label [[BB1_THREAD:%.*]] +; CHECK: bb1.thread: +; CHECK-NEXT: store i32 42, i32* [[P:%.*]], align 4 +; CHECK-NEXT: bitcast [3 x i8*]* %vtable to i8* +; CHECK-NEXT: call i1 @llvm.type.test +; CHECK-NEXT: call void @llvm.assume +; CHECK-NEXT: bitcast [3 x i8*]* %vtable to i8* +; CHECK-NEXT: call i1 @llvm.type.test +; CHECK-NEXT: call void @llvm.assume +; CHECK-NEXT: bitcast [3 x i8*]* %vtable to i8* +; CHECK-NEXT: call i1 @llvm.type.test +; CHECK-NEXT: call void @llvm.assume +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[DOTPR:%.*]] = load i32, i32* [[P]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[DOTPR]], 36 +; CHECK-NEXT: br i1 [[TMP2]], label [[BB3]], label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 (...) @f2() #[[ATTR0]] +; CHECK-NEXT: ret i32 0 +; CHECK: bb3: +; CHECK-NEXT: [[RES_02:%.*]] = phi i32 [ 1, [[BB1_THREAD]] ], [ 0, [[BB1]] ] +; CHECK-NEXT: ret i32 [[RES_02]] +; +; We should not thread if we've set the scan limit low. +; SKIPLIM6-LABEL: @test1_type_test_assume +; SKIPLIM6-NOT: thread +; SKIPLIM6: ret +entry: + %0 = tail call i32 (...) @f1() nounwind ; [#uses=1] + %1 = icmp eq i32 %0, 0 ; [#uses=1] + br i1 %1, label %bb1, label %bb + +bb: ; preds = %entry + store i32 42, i32* %P, align 4 + + ; Insert 3 bitcast / type test / assume sequences so that we would be above + ; a skip limit of 6. 
+ %vtablei8 = bitcast [3 x i8*]* %vtable to i8* + %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"foo") + tail call void @llvm.assume(i1 %p) + %vtablei82 = bitcast [3 x i8*]* %vtable to i8* + %p2 = call i1 @llvm.type.test(i8* %vtablei82, metadata !"foo") + tail call void @llvm.assume(i1 %p2) + %vtablei83 = bitcast [3 x i8*]* %vtable to i8* + %p3 = call i1 @llvm.type.test(i8* %vtablei83, metadata !"foo") + tail call void @llvm.assume(i1 %p3) + + br label %bb1 + +bb1: ; preds = %entry, %bb + %res.0 = phi i32 [ 1, %bb ], [ 0, %entry ] ; [#uses=2] + %2 = load i32, i32* %P, align 4 ; [#uses=1] + %3 = icmp sgt i32 %2, 36 ; [#uses=1] + br i1 %3, label %bb3, label %bb2 + +bb2: ; preds = %bb1 + %4 = tail call i32 (...) @f2() nounwind ; [#uses=0] + ret i32 %res.0 + +bb3: ; preds = %bb1 + ret i32 %res.0 +} + +; Make sure we don't infinite loop in the presence of an unreachable loop +; when searching up through single preds for available pointer loads. +define i32 @fn_SinglePredUnreachable(i1 %c2,i64* %P) { +; CHECK-LABEL: @fn_SinglePredUnreachable( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L1:%.*]] = load i64, i64* [[P:%.*]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[L1]], 0 +; CHECK-NEXT: br label [[COND2:%.*]] +; CHECK: cond1: +; CHECK-NEXT: br i1 false, label [[COND1:%.*]], label %[[CONDPRESPLIT:.*]] +; CHECK: [[CONDPRESPLIT]]: +; CHECK-NEXT: [[L2PR:%.*]] = load i64, i64* [[P]], align 4 +; CHECK-NEXT: br label [[COND2:%.*]] +; CHECK: cond2: +; CHECK-NEXT: [[L2:%.*]] = phi i64 [ [[L2PR]], %[[CONDPRESPLIT]] ] +; CHECK-NEXT: call void @fn2(i64 [[L2]]) +; CHECK-NEXT: [[C3:%.*]] = icmp eq i64 [[L2]], 0 +; CHECK-NEXT: br i1 %c3, label [[COND3:%.*]], label [[END:%.*]] +; CHECK: cond3: +; CHECK-NEXT: call void @fn3(i64 [[L2]]) +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: ret i32 0 +; + +entry: + %l1 = load i64, i64* %P + %c = icmp eq i64 %l1, 0 + br label %cond2 + +cond1: + br i1 false, label %cond1, label %cond2 + +cond2: + %l2 = load i64, i64* %P + 
call void @fn2(i64 %l2) + %c3 = icmp eq i64 %l2, 0 + br i1 %c3, label %cond3, label %end + +cond3: + call void @fn3(i64 %l2) + br label %end + +end: + ret i32 0 +} ; CHECK: [[RNG4]] = !{i32 0, i32 1}