Index: llvm/include/llvm/IR/Value.h =================================================================== --- llvm/include/llvm/IR/Value.h +++ llvm/include/llvm/IR/Value.h @@ -321,6 +321,13 @@ /// values or constant users. void replaceUsesOutsideBlock(Value *V, BasicBlock *BB); + /// replaceUsesInsideBlock - Go through the uses list for this definition and + /// make each use point to "V" instead of "this" when the use is inside the + /// block. + /// Unlike replaceAllUsesWith this function does not support basic block + /// values or constant users. + void replaceUsesInsideBlock(Value *V, BasicBlock *BB); + //---------------------------------------------------------------------- // Methods for handling the chain of uses of this Value. // Index: llvm/lib/Analysis/MemorySSA.cpp =================================================================== --- llvm/lib/Analysis/MemorySSA.cpp +++ llvm/lib/Analysis/MemorySSA.cpp @@ -284,6 +284,11 @@ case Intrinsic::invariant_start: case Intrinsic::invariant_end: case Intrinsic::assume: + case Intrinsic::noalias_decl: + case Intrinsic::noalias: + case Intrinsic::side_noalias: + case Intrinsic::noalias_arg_guard: + case Intrinsic::noalias_copy_guard: return {false, NoAlias}; case Intrinsic::dbg_addr: case Intrinsic::dbg_declare: @@ -1736,9 +1741,18 @@ // dependencies here. // FIXME: Replace this special casing with a more accurate modelling of // assume's control dependency. - if (IntrinsicInst *II = dyn_cast(I)) - if (II->getIntrinsicID() == Intrinsic::assume) + if (IntrinsicInst *II = dyn_cast(I)) { + switch(II->getIntrinsicID()) { + default: break; + case Intrinsic::assume: + case Intrinsic::noalias_decl: + case Intrinsic::noalias: + case Intrinsic::side_noalias: + case Intrinsic::noalias_arg_guard: + case Intrinsic::noalias_copy_guard: return nullptr; + } + } // Using a nonstandard AA pipelines might leave us with unexpected modref // results for I, so add a check to not model instructions that may not read Index: llvm/lib/IR/Value.cpp =================================================================== --- llvm/lib/IR/Value.cpp +++ llvm/lib/IR/Value.cpp @@ -451,6 +451,26 @@ }); } +// Like replaceAllUsesWith except it does not handle constants or basic blocks. +// This routine leaves uses outside BB. +void Value::replaceUsesInsideBlock(Value *New, BasicBlock *BB) { + assert(New && "Value::replaceUsesInsideBlock(, BB) is invalid!"); + assert(!contains(New, this) && + "this->replaceUsesInsideBlock(expr(this), BB) is NOT valid!"); + assert(New->getType() == getType() && + "replaceUses of value with new value of different type!"); + assert(BB && "Basic block that may contain a use of 'New' must be defined\n"); + + use_iterator UI = use_begin(), E = use_end(); + for (; UI != E;) { + Use &U = *UI; + ++UI; + auto *Usr = dyn_cast(U.getUser()); + if (Usr && Usr->getParent() == BB) + U.set(New); + } +} + namespace { // Various metrics for how much to strip off of pointers. enum PointerStripKind { Index: llvm/lib/Transforms/Utils/LoopRotationUtils.cpp =================================================================== --- llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -38,6 +38,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/NoAliasUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; @@ -255,6 +256,81 @@ return false; } + // check if there are local restrict declarations and how they are used. + // - avoid breaking up mixed usage: + // -- either all usages must be in the OrigHeader, or no usages must be in the + // OrigHeader + // -- when usages are outside the function, and we decide to continue, break + // the connection with the llvm.noalias.decl, as it will have no impact any + // more. + SmallVector NoAliasDeclInstructions; + { + SmallVector SideNoAliasOrNoAliasToDisconnect; + for (Instruction &I : *OrigHeader) { + if (IntrinsicInst *II = dyn_cast(&I)) { + if (II->getIntrinsicID() == Intrinsic::noalias_decl) { + // Check usage validity: + bool UsedInsideHeader = false; + bool UsedOutsideHeaderInsideLoop = false; + for (User *U : II->users()) { + Instruction *UI=cast(U); + if (UI->getParent() == OrigHeader) { + UsedInsideHeader = true; + } else if (L->contains(UI)) { + UsedOutsideHeaderInsideLoop = true; + } else { + if (PHINode *PUI = dyn_cast(UI)) { + if (PUI->getNumIncomingValues() > 1) { + LLVM_DEBUG(llvm::dbgs() << "LoopRotation: NOT rotating - " << + *II << "\n used in PHI node " << *PUI << + ",\n in exit block with multiple entries.\n"); + return false; + } + } + SideNoAliasOrNoAliasToDisconnect.push_back(UI); + } + } + + if (UsedInsideHeader && UsedOutsideHeaderInsideLoop) { + LLVM_DEBUG(llvm::dbgs() << "LoopRotation: NOT rotating - " << + *II << " used in header and other parts of the loop.\n" + " Rotation would reduced the llvm.noalias quality too much.\n"); + return false; + } + + NoAliasDeclInstructions.push_back(II); + } + } + } + + // If we get here, we will do the rotate. + // First break the link between any llvm.noalias.decl and its outside-loop + // usage + for (Instruction *I : SideNoAliasOrNoAliasToDisconnect) { + unsigned OpN; + if (PHINode* PI = dyn_cast(I)) { + // can only happen when in the exit block -> single predecessor ! + assert(PI->getNumIncomingValues() == 1 && + "PHI node should have a single value"); + (void)PI; // Silence not-used warning in Release builds + OpN=0; + } else { + IntrinsicInst *II = cast(I); + if (II->getIntrinsicID() == Intrinsic::side_noalias) { + OpN = Intrinsic::SideNoAliasNoAliasDeclArg; + } else if (II->getIntrinsicID() == Intrinsic::noalias) { + OpN = Intrinsic::NoAliasNoAliasDeclArg; + } else { + assert(II->getIntrinsicID() == Intrinsic::noalias_copy_guard); + OpN = Intrinsic::NoAliasCopyGuardNoAliasDeclArg; + } + } + + auto *PT = cast(I->getOperand(OpN)->getType()); + I->setOperand(OpN, ConstantPointerNull::get(PT)); + } + } + // Now, this loop is suitable for rotation. BasicBlock *OrigPreheader = L->getLoopPreheader(); @@ -380,6 +456,74 @@ } } + if (!NoAliasDeclInstructions.empty()) { + // There are local restrict declarations: + // (general): + // Original: OrigPre { OrigHeader NewHeader ... Latch } + // after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader } + // + // with D: llvm.noalias.decl, U: side_noalias, depending on D + // ... { D U } can transform into: + // (0) : ... { D U } // no relevant rotation for this part + // (1) : ... D' { U D } + // (2) : ... D' U' { D U } + // + // We now want to transform: + // (1) -> : ... D' { D U D'' } + // (2) -> : ... D' U' { D D'' U'' } + // D: original llvm.noalias.decl + // D', U': duplicate with replaced scopes + // D'', U'': different duplicate with replaced scopes + // This ensures a safe fallback to 'may_alias' introduced by the rotate, as + // U'' and U' + // scopes will not be compatible wrt to the local restrict + + // Clone the NoAliasDecl again, insert right after the original, move the + // original to the NewHeader. + + Instruction* NewHeaderInsertionPoint = &(*NewHeader->begin()); + for (Instruction* NAD : NoAliasDeclInstructions) { + LLVM_DEBUG(llvm::dbgs() << " Cloning llvm.noalias.decl:" << *NAD << "\n"); + Instruction* NewNAD=NAD->clone(); + NewNAD->insertBefore(NAD); + + // remap dependencies in the OrigHeader block to NewNAD + NAD->replaceUsesInsideBlock(NewNAD, OrigHeader); + + // And move the original NAD to the NewHeader + NAD->moveBefore(NewHeaderInsertionPoint); + + // Now forget about the original NAD mapping + auto tmp = ValueMap[NAD]; // use a local copy to avoid undefined behavior + ValueMap[NewNAD] = tmp; // this could trigger a reallocation. + ValueMap.erase(NAD); + } + + // Scopes must now be duplicated, once for OrigHeader and once for + // OrigPreHeader' + { + auto& Context = NewHeader->getContext(); + + SmallVector NoAliasDeclScopes; + for (Instruction* NAD : NoAliasDeclInstructions) + NoAliasDeclScopes.push_back(cast( + NAD->getOperand(Intrinsic::NoAliasDeclScopeArg))); + + LLVM_DEBUG(llvm::dbgs() << " Updating OrigHeader scopes\n"); + llvm::cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, { OrigHeader }, + Context, "h.rot1"); + LLVM_DEBUG(OrigHeader->dump()); + + LLVM_DEBUG(llvm::dbgs() << " Updating OrigPreheader scopes\n"); + llvm::cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, { OrigPreheader }, + Context, "pre.rot1"); + LLVM_DEBUG(OrigPreheader->dump()); + + LLVM_DEBUG(llvm::dbgs() << " Updated NewHeader:\n"); + LLVM_DEBUG(NewHeader->dump()); + } + } + // Along with all the other instructions, we just cloned OrigHeader's // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's // successors by duplicating their incoming values for OrigHeader. Index: llvm/test/Transforms/LoopRotate/noalias.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopRotate/noalias.ll @@ -0,0 +1,186 @@ +; RUN: opt -S -loop-rotate < %s | FileCheck %s +; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -passes='require,require,loop(rotate)' < %s | FileCheck %s +; RUN: opt -S -passes='require,require,loop(rotate)' -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare void @g(i32*) + +define void @test_02(i32* nocapture %_pA) nounwind ssp { +entry: + %array = alloca [20 x i32], align 16 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %p.decl = tail call i8* @llvm.noalias.decl.p0i8.p0p0i32.i32(i32** null, i32 0, metadata !2) + %side.p = tail call i32* @llvm.side.noalias.p0i32.p0i8.p0p0i32.p0p0i32.i32(i32* %_pA, i8* %p.decl, i32** null, i32** undef, i32 0, metadata !2), !tbaa !5, !noalias !2 + store i32 42, i32* %_pA, noalias_sidechannel i32* %side.p, align 16 + %cmp = icmp slt i32 %i.0, 100 + %arrayidx = getelementptr inbounds [20 x i32], [20 x i32]* %array, i64 0, i64 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + store i32 0, i32* %arrayidx, align 16 + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %arrayidx.lcssa = phi i32* [ %arrayidx, %for.cond ] + call void @g(i32* %arrayidx.lcssa) nounwind + ret void +} + +; CHECK-LABEL: @test_02( +; CHECK: entry: +; CHECK: %p.decl1 = tail call i8* @llvm.noalias.decl.p0i8.p0p0i32.i32(i32** null, i32 0, metadata !2) +; CHECK: %side.p2 = tail call i32* @llvm.side.noalias.p0i32.p0i8.p0p0i32.p0p0i32.i32(i32* %_pA, i8* %p.decl1, i32** null, i32** undef, i32 0, metadata !2), !tbaa !5, !noalias !2 +; CHECK: store i32 42, i32* %_pA, noalias_sidechannel i32* undef, align 16 +; CHECK: for.body: +; CHECK: %p.decl = tail call i8* @llvm.noalias.decl.p0i8.p0p0i32.i32(i32** null, i32 0, metadata !9) +; CHECK: %0 = tail call i8* @llvm.noalias.decl.p0i8.p0p0i32.i32(i32** null, i32 0, metadata !11) +; CHECK: %side.p = tail call i32* @llvm.side.noalias.p0i32.p0i8.p0p0i32.p0p0i32.i32(i32* %_pA, i8* %0, i32** null, i32** undef, i32 0, metadata !11), !tbaa !5, !noalias !11 +; CHECK: store i32 42, i32* %_pA, noalias_sidechannel i32* %side.p, align 16 +; CHECK: for.end: + + +define void @test_03(i32* nocapture %_pA) nounwind ssp { +entry: + %array = alloca [20 x i32], align 16 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp slt i32 %i.0, 100 + %arrayidx = getelementptr inbounds [20 x i32], [20 x i32]* %array, i64 0, i64 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %p.decl = tail call i8* @llvm.noalias.decl.p0i8.p0p0i32.i32(i32** null, i32 0, metadata !2) + %side.p = tail call i32* @llvm.side.noalias.p0i32.p0i8.p0p0i32.p0p0i32.i32(i32* %_pA, i8* %p.decl, i32** null, i32** undef, i32 0, metadata !2), !tbaa !5, !noalias !2 + store i32 42, i32* %_pA, noalias_sidechannel i32* %side.p, align 16 + store i32 0, i32* %arrayidx, align 16 + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %arrayidx.lcssa = phi i32* [ %arrayidx, %for.cond ] + call void @g(i32* %arrayidx.lcssa) nounwind + ret void +} +; CHECK-LABEL: @test_03( +; CHECK: entry: +; CHECK: for.body: +; CHECK: %p.decl = tail call i8* @llvm.noalias.decl.p0i8.p0p0i32.i32(i32** null, i32 0, metadata !9) +; CHECK: %side.p = tail call i32* @llvm.side.noalias.p0i32.p0i8.p0p0i32.p0p0i32.i32(i32* %_pA, i8* %p.decl, i32** null, i32** undef, i32 0, metadata !9), !tbaa !5, !noalias !9 +; CHECK: store i32 42, i32* %_pA, noalias_sidechannel i32* %side.p, align 16 +; CHECK: for.end: + +define void @test_04(i32* nocapture %_pA) nounwind ssp { +entry: + %array = alloca [20 x i32], align 16 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %p.decl = tail call i8* @llvm.noalias.decl.p0i8.p0p0i32.i32(i32** null, i32 0, metadata !2) + %side.p = tail call i32* @llvm.side.noalias.p0i32.p0i8.p0p0i32.p0p0i32.i32(i32* %_pA, i8* %p.decl, i32** null, i32** undef, i32 0, metadata !2), !tbaa !5, !noalias !2 + store i32 42, i32* %_pA, noalias_sidechannel i32* %side.p, align 16 + %cmp = icmp slt i32 %i.0, 100 + %arrayidx = getelementptr inbounds [20 x i32], [20 x i32]* %array, i64 0, i64 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + store i32 0, i32* %arrayidx, align 16 + store i32 43, i32* %_pA, noalias_sidechannel i32* %side.p, align 16 + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %arrayidx.lcssa = phi i32* [ %arrayidx, %for.cond ] + call void @g(i32* %arrayidx.lcssa) nounwind + ret void +} +; CHECK-LABEL: @test_04( +; CHECK: entry: +; CHECK: %p.decl1 = tail call i8* @llvm.noalias.decl.p0i8.p0p0i32.i32(i32** null, i32 0, metadata !13) +; CHECK: %side.p2 = tail call i32* @llvm.side.noalias.p0i32.p0i8.p0p0i32.p0p0i32.i32(i32* %_pA, i8* %p.decl1, i32** null, i32** undef, i32 0, metadata !13), !tbaa !5, !noalias !13 +; CHECK: store i32 42, i32* %_pA, noalias_sidechannel i32* undef, align 16 +; CHECK: for.body: +; CHECK: %side.p4 = phi i32* [ %side.p2, %entry ], [ %side.p, %for.body ] +; CHECK: %p.decl = tail call i8* @llvm.noalias.decl.p0i8.p0p0i32.i32(i32** null, i32 0, metadata !9) +; CHECK: store i32 43, i32* %_pA, noalias_sidechannel i32* %side.p4, align 16 +; CHECK: %0 = tail call i8* @llvm.noalias.decl.p0i8.p0p0i32.i32(i32** null, i32 0, metadata !15) +; CHECK: %side.p = tail call i32* @llvm.side.noalias.p0i32.p0i8.p0p0i32.p0p0i32.i32(i32* %_pA, i8* %0, i32** null, i32** undef, i32 0, metadata !15), !tbaa !5, !noalias !15 +; CHECK: store i32 42, i32* %_pA, noalias_sidechannel i32* %side.p, align 16 +; CHECK: for.end: + +define void @test_05(i32* nocapture %_pA) nounwind ssp { +entry: + %array = alloca [20 x i32], align 16 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %p.decl = tail call i8* @llvm.noalias.decl.p0i8.p0p0i32.i32(i32** null, i32 0, metadata !2) + %side.p = tail call i32* @llvm.side.noalias.p0i32.p0i8.p0p0i32.p0p0i32.i32(i32* %_pA, i8* %p.decl, i32** null, i32** undef, i32 0, metadata !2), !tbaa !5, !noalias !2 + store i32 42, i32* %_pA, noalias_sidechannel i32* %side.p, align 16 + %cmp = icmp slt i32 %i.0, 100 + %arrayidx = getelementptr inbounds [20 x i32], [20 x i32]* %array, i64 0, i64 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + store i32 0, i32* %arrayidx, align 16 + store i32 43, i32* %_pA, noalias_sidechannel i32* %side.p, align 16 + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %arrayidx.lcssa = phi i32* [ %arrayidx, %for.cond ] + store i32 44, i32* %_pA, noalias_sidechannel i32* %side.p, align 16 + call void @g(i32* %arrayidx.lcssa) nounwind + ret void +} +; CHECK-LABEL: @test_05( +; CHECK: entry: +; CHECK: %p.decl1 = tail call i8* @llvm.noalias.decl.p0i8.p0p0i32.i32(i32** null, i32 0, metadata !17) +; CHECK: %side.p2 = tail call i32* @llvm.side.noalias.p0i32.p0i8.p0p0i32.p0p0i32.i32(i32* %_pA, i8* %p.decl1, i32** null, i32** undef, i32 0, metadata !17), !tbaa !5, !noalias !17 +; CHECK: store i32 42, i32* %_pA, noalias_sidechannel i32* undef, align 16 +; CHECK: for.body: +; CHECK: %side.p4 = phi i32* [ %side.p2, %entry ], [ %side.p, %for.body ] +; CHECK: %p.decl = tail call i8* @llvm.noalias.decl.p0i8.p0p0i32.i32(i32** null, i32 0, metadata !9) +; CHECK: store i32 43, i32* %_pA, noalias_sidechannel i32* %side.p4, align 16 +; CHECK: %0 = tail call i8* @llvm.noalias.decl.p0i8.p0p0i32.i32(i32** null, i32 0, metadata !19) +; CHECK: %side.p = tail call i32* @llvm.side.noalias.p0i32.p0i8.p0p0i32.p0p0i32.i32(i32* %_pA, i8* %0, i32** null, i32** undef, i32 0, metadata !19), !tbaa !5, !noalias !19 +; CHECK: store i32 42, i32* %_pA, noalias_sidechannel i32* %side.p, align 16 +; CHECK: for.end: +; CHECK: %side.p.lcssa = phi i32* [ %side.p, %for.body ] +; CHECK: store i32 44, i32* %_pA, noalias_sidechannel i32* %side.p.lcssa, align 16 + + +; Function Attrs: argmemonly nounwind +declare i8* @llvm.noalias.decl.p0i8.p0p0i32.i32(i32**, i32, metadata) #1 + +; Function Attrs: nounwind readnone speculatable +declare i32* @llvm.side.noalias.p0i32.p0i8.p0p0i32.p0p0i32.i32(i32*, i8*, i32**, i32**, i32, metadata) #2 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind readnone speculatable } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang"} +!2 = !{!3} +!3 = distinct !{!3, !4, !"test_loop_rotate_XX: pA"} +!4 = distinct !{!4, !"test_loop_rotate_XX"} +!5 = !{!6, !6, i64 0, i64 4} +!6 = !{!7, i64 4, !"any pointer"} +!7 = !{!8, i64 1, !"omnipotent char"} +!8 = !{!"Simple C/C++ TBAA"} +!9 = !{!10, !10, i64 0, i64 4} +!10 = !{!7, i64 4, !"int"}