diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -1282,6 +1282,17 @@
   }
 };
 
+/// This represents the llvm.type.test intrinsic.
+class TypeTestInst : public IntrinsicInst {
+public:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::type_test;
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+};
+
 } // end namespace llvm
 
 #endif // LLVM_IR_INTRINSICINST_H
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -532,7 +532,17 @@
     // We must ignore debug info directives when counting (otherwise they
     // would affect codegen).
     Instruction *Inst = &*--ScanFrom;
-    if (Inst->isDebugOrPseudoInst())
+    // Ignore unrelated instruction types that in particular may only be
+    // inserted under special compile modes. The scan limit is set quite small
+    // (6 instructions) by default, so these can have an outsized effect on
+    // this optimization. Additionally, if these instructions are only present
+    // when feeding back an instrumentation PGO profile, the optimization
+    // differences can affect profile matching. Specifically, it is not
+    // uncommon to apply -g or whole program devirtualization (which results
+    // in bitcast / llvm.type.test / llvm.assume sequences) only when feeding
+    // back profiles and not for the instrumentation build.
+    if (Inst->isDebugOrPseudoInst() || isa<BitCastInst>(Inst) ||
+        isa<TypeTestInst>(Inst) || isa<AssumeInst>(Inst))
       continue;
 
     // Restore ScanFrom to expected value in case next test succeeds
@@ -620,7 +630,17 @@
   SmallVector<Instruction *> MustNotAliasInsts;
   for (Instruction &Inst : make_range(++Load->getReverseIterator(),
                                       ScanBB->rend())) {
-    if (Inst.isDebugOrPseudoInst())
+    // Ignore unrelated instruction types that in particular may only be
+    // inserted under special compile modes. The scan limit is set quite small
+    // (6 instructions) by default, so these can have an outsized effect on
+    // this optimization. Additionally, if these instructions are only present
+    // when feeding back an instrumentation PGO profile, the optimization
+    // differences can affect profile matching. Specifically, it is not
+    // uncommon to apply -g or whole program devirtualization (which results
+    // in bitcast / llvm.type.test / llvm.assume sequences) only when feeding
+    // back profiles and not for the instrumentation build.
+    if (Inst.isDebugOrPseudoInst() || isa<BitCastInst>(&Inst) ||
+        isa<TypeTestInst>(&Inst) || isa<AssumeInst>(&Inst))
       continue;
 
     if (MaxInstsToScan-- == 0)
diff --git a/llvm/test/Transforms/InstCombine/load.ll b/llvm/test/Transforms/InstCombine/load.ll
--- a/llvm/test/Transforms/InstCombine/load.ll
+++ b/llvm/test/Transforms/InstCombine/load.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -instcombine -S < %s | FileCheck %s
-; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+; Set available-load-scan-limit to the current default of 6 so that the test
+; works the same if the limit is adjusted in the future.
+; RUN: opt -instcombine -available-load-scan-limit=6 -S < %s | FileCheck %s
+; RUN: opt -passes=instcombine -available-load-scan-limit=6 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-p:64:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1"
 
@@ -100,6 +102,37 @@
   ret i32 %X
 }
 
+declare void @llvm.assume(i1)
+declare i1 @llvm.type.test(i8*, metadata) nounwind readnone
+
+; Ensure that store-to-load forwarding is not prevented by the bitcast /
+; type test / assume sequences inserted for whole program devirtualization.
+define i32 @test8_type_test_assume(i32* %P, [3 x i8*]* %vtable) {
+; CHECK-LABEL: @test8_type_test_assume(
+; CHECK-NEXT:    store i32 1, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    bitcast [3 x i8*]* %vtable to i8*
+; CHECK-NEXT:    call i1 @llvm.type.test
+; CHECK-NEXT:    call void @llvm.assume
+; CHECK-NEXT:    bitcast [3 x i8*]* %vtable to i8*
+; CHECK-NEXT:    call i1 @llvm.type.test
+; CHECK-NEXT:    call void @llvm.assume
+; CHECK-NEXT:    ret i32 1
+;
+  store i32 1, i32* %P
+
+  ; Insert 2 bitcast / type test / assume sequences so that we would be above
+  ; the scan limit of 6 if they were not ignored.
+  %vtablei8 = bitcast [3 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"foo")
+  tail call void @llvm.assume(i1 %p)
+  %vtablei82 = bitcast [3 x i8*]* %vtable to i8*
+  %p2 = call i1 @llvm.type.test(i8* %vtablei82, metadata !"foo")
+  tail call void @llvm.assume(i1 %p2)
+
+  %X = load i32, i32* %P ; <i32> [#uses=1]
+  ret i32 %X
+}
+
 define i32 @test9(i32* %P) {
 ; CHECK-LABEL: @test9(
 ; CHECK-NEXT:    ret i32 0
diff --git a/llvm/test/Transforms/JumpThreading/thread-loads.ll b/llvm/test/Transforms/JumpThreading/thread-loads.ll
--- a/llvm/test/Transforms/JumpThreading/thread-loads.ll
+++ b/llvm/test/Transforms/JumpThreading/thread-loads.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -jump-threading -S | FileCheck %s
-; RUN: opt < %s -aa-pipeline=basic-aa -passes=jump-threading -S | FileCheck %s
+; Set available-load-scan-limit to the current default of 6 so that the test
+; works the same if the limit is adjusted in the future.
+; RUN: opt < %s -jump-threading -available-load-scan-limit=6 -S | FileCheck %s
+; RUN: opt < %s -aa-pipeline=basic-aa -passes=jump-threading -available-load-scan-limit=6 -S | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin7"
@@ -682,6 +684,70 @@
   ret i32 10
 }
 
+declare void @llvm.assume(i1)
+declare i1 @llvm.type.test(i8*, metadata) nounwind readnone
+
+; Test that we can thread through the block with the partially redundant load
+; (%2), ensuring that the threading is not prevented by the bitcast /
+; type test / assume sequences inserted for whole program devirtualization.
+define i32 @test1_type_test_assume(i32* %P, [3 x i8*]* %vtable) nounwind {
+; CHECK-LABEL: @test1_type_test_assume(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 (...) @f1() #[[ATTR0:[0-9]+]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[BB1:%.*]], label [[BB1_THREAD:%.*]]
+; CHECK:       bb1.thread:
+; CHECK-NEXT:    store i32 42, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    bitcast [3 x i8*]* %vtable to i8*
+; CHECK-NEXT:    call i1 @llvm.type.test
+; CHECK-NEXT:    call void @llvm.assume
+; CHECK-NEXT:    bitcast [3 x i8*]* %vtable to i8*
+; CHECK-NEXT:    call i1 @llvm.type.test
+; CHECK-NEXT:    call void @llvm.assume
+; CHECK-NEXT:    br label [[BB3:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[DOTPR:%.*]] = load i32, i32* [[P]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt i32 [[DOTPR]], 36
+; CHECK-NEXT:    br i1 [[TMP2]], label [[BB3]], label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 (...) @f2() #[[ATTR0]]
+; CHECK-NEXT:    ret i32 0
+; CHECK:       bb3:
+; CHECK-NEXT:    [[RES_02:%.*]] = phi i32 [ 1, [[BB1_THREAD]] ], [ 0, [[BB1]] ]
+; CHECK-NEXT:    ret i32 [[RES_02]]
+;
+entry:
+  %0 = tail call i32 (...) @f1() nounwind ; <i32> [#uses=1]
+  %1 = icmp eq i32 %0, 0 ; <i1> [#uses=1]
+  br i1 %1, label %bb1, label %bb
+
+bb: ; preds = %entry
+  store i32 42, i32* %P, align 4
+
+  ; Insert 2 bitcast / type test / assume sequences so that we would be above
+  ; the scan limit of 6 if they were not ignored.
+  %vtablei8 = bitcast [3 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"foo")
+  tail call void @llvm.assume(i1 %p)
+  %vtablei82 = bitcast [3 x i8*]* %vtable to i8*
+  %p2 = call i1 @llvm.type.test(i8* %vtablei82, metadata !"foo")
+  tail call void @llvm.assume(i1 %p2)
+
+  br label %bb1
+
+bb1: ; preds = %entry, %bb
+  %res.0 = phi i32 [ 1, %bb ], [ 0, %entry ] ; <i32> [#uses=2]
+  %2 = load i32, i32* %P, align 4 ; <i32> [#uses=1]
+  %3 = icmp sgt i32 %2, 36 ; <i1> [#uses=1]
+  br i1 %3, label %bb3, label %bb2
+
+bb2: ; preds = %bb1
+  %4 = tail call i32 (...) @f2() nounwind ; <i32> [#uses=0]
+  ret i32 %res.0
+
+bb3: ; preds = %bb1
+  ret i32 %res.0
+}
 
 ; CHECK: [[RNG4]] = !{i32 0, i32 1}
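
For context, a minimal usage sketch of the new TypeTestInst wrapper added in IntrinsicInst.h above. This is not part of the patch; the helper function and its name (countTypeTests) are hypothetical. It relies only on the classof() overloads the class defines, so isa<> / dyn_cast<> dispatch works the same way it does for other IntrinsicInst subclasses such as AssumeInst:

// Hypothetical example (not part of the patch): count the llvm.type.test
// calls in a function via the new TypeTestInst wrapper.
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

static unsigned countTypeTests(Function &F) {
  unsigned NumTypeTests = 0;
  for (Instruction &I : instructions(F))
    if (isa<TypeTestInst>(&I)) // true only for calls to llvm.type.test
      ++NumTypeTests;
  return NumTypeTests;
}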