Index: llvm/lib/Transforms/Utils/LoopRotationUtils.cpp =================================================================== --- llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -35,6 +35,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -400,6 +401,15 @@ break; } + // Remember the local noalias scope declarations in the header. After the + // rotation, they must be duplicated and the scope must be cloned. This + // avoids unwanted interaction across iterations. + SmallVector NoAliasDeclInstructions; + for (Instruction &I : *OrigHeader) + if (IntrinsicInst *II = dyn_cast(&I)) + if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) + NoAliasDeclInstructions.push_back(II); + while (I != E) { Instruction *Inst = &*I++; @@ -460,6 +470,62 @@ } } + if (!NoAliasDeclInstructions.empty()) { + // There are noalias scope declarations: + // (general): + // Original: OrigPre { OrigHeader NewHeader ... Latch } + // after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader } + // + // with D: llvm.experimental.noalias.scope.decl, + // U: !noalias or !alias.scope depending on D + // ... { D U1 U2 } can transform into: + // (0) : ... { D U1 U2 } // no relevant rotation for this part + // (1) : ... D' { U1 U2 D } // D is part of OrigHeader + // (2) : ... D' U1' { U2 D U1 } // D, U1 are part of OrigHeader + // + // We now want to transform: + // (1) -> : ... D' { D U1 U2 D'' } + // (2) -> : ... D' U1' { D U2 D'' U1'' } + // D: original llvm.experimental.noalias.scope.decl + // D', U1': duplicate with replaced scopes + // D'', U1'': different duplicate with replaced scopes + // This ensures a safe fallback to 'may_alias' introduced by the rotate, + // as U1'' and U1' scopes will not be compatible wrt to the local restrict + + // Clone the llvm.experimental.noalias.decl again for the NewHeader. + Instruction *NewHeaderInsertionPoint = &(*NewHeader->getFirstNonPHI()); + for (Instruction *NAD : NoAliasDeclInstructions) { + LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:" + << *NAD << "\n"); + Instruction *NewNAD = NAD->clone(); + NewNAD->insertBefore(NewHeaderInsertionPoint); + } + + // Scopes must now be duplicated, once for OrigHeader and once for + // OrigPreHeader'. + { + auto &Context = NewHeader->getContext(); + + SmallVector NoAliasDeclScopes; + for (Instruction *NAD : NoAliasDeclInstructions) + NoAliasDeclScopes.push_back(cast( + NAD->getOperand(Intrinsic::NoAliasScopeDeclScopeArg))); + + LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n"); + cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigHeader}, Context, + "h.rot"); + LLVM_DEBUG(OrigHeader->dump()); + + LLVM_DEBUG(dbgs() << " Updating OrigPreheader scopes\n"); + cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigPreheader}, Context, + "pre.rot"); + LLVM_DEBUG(OrigPreheader->dump()); + + LLVM_DEBUG(dbgs() << " Updated NewHeader:\n"); + LLVM_DEBUG(NewHeader->dump()); + } + } + // Along with all the other instructions, we just cloned OrigHeader's // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's // successors by duplicating their incoming values for OrigHeader. Index: llvm/test/Transforms/LoopRotate/noalias.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopRotate/noalias.ll @@ -0,0 +1,185 @@ +; RUN: opt -S -loop-rotate < %s | FileCheck %s +; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -passes='require,require,loop(loop-rotate)' < %s | FileCheck %s +; RUN: opt -S -passes='require,require,loop(loop-rotate)' -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare void @g(i32*) + +define void @test_02(i32* nocapture %_pA) nounwind ssp { +; CHECK-LABEL: @test_02( +; CHECK: entry: +; CHECK: tail call void @llvm.experimental.noalias.scope.decl(metadata !2) +; CHECK: store i32 42, i32* %_pA, align 16, !alias.scope !2 +; CHECK: for.body: +; CHECK: tail call void @llvm.experimental.noalias.scope.decl(metadata !5) +; CHECK: store i32 0, i32* %arrayidx, align 16, !noalias !5 +; CHECK: tail call void @llvm.experimental.noalias.scope.decl(metadata !7) +; CHECK: store i32 42, i32* %_pA, align 16, !alias.scope !7 +; CHECK: for.end: + +entry: + %array = alloca [20 x i32], align 16 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + tail call void @llvm.experimental.noalias.scope.decl(metadata !2) + store i32 42, i32* %_pA, align 16, !alias.scope !2 + %cmp = icmp slt i32 %i.0, 100 + %arrayidx = getelementptr inbounds [20 x i32], [20 x i32]* %array, i64 0, i64 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + store i32 0, i32* %arrayidx, align 16, !noalias !2 + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %arrayidx.lcssa = phi i32* [ %arrayidx, %for.cond ] + call void @g(i32* %arrayidx.lcssa) nounwind + ret void +} + +define void @test_03(i32* nocapture %_pA) nounwind ssp { +; CHECK-LABEL: @test_03( +; CHECK: entry: +; CHECK: for.body: +; CHECK: tail call void @llvm.experimental.noalias.scope.decl(metadata !5) +; CHECK: store i32 42, i32* %_pA, align 16, !alias.scope !5 +; CHECK: store i32 0, i32* %arrayidx, align 16, !noalias !5 +; CHECK: for.end: + +entry: + %array = alloca [20 x i32], align 16 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp slt i32 %i.0, 100 + %arrayidx = getelementptr inbounds [20 x i32], [20 x i32]* %array, i64 0, i64 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + tail call void @llvm.experimental.noalias.scope.decl(metadata !2) + store i32 42, i32* %_pA, align 16, !alias.scope !2 + store i32 0, i32* %arrayidx, align 16, !noalias !2 + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %arrayidx.lcssa = phi i32* [ %arrayidx, %for.cond ] + call void @g(i32* %arrayidx.lcssa) nounwind + ret void +} + +define void @test_04(i32* nocapture %_pA) nounwind ssp { +; CHECK-LABEL: @test_04( +; CHECK: entry: +; CHECK: tail call void @llvm.experimental.noalias.scope.decl(metadata !9) +; CHECK: store i32 42, i32* %_pA, align 16, !alias.scope !9 +; CHECK: for.body: +; CHECK: tail call void @llvm.experimental.noalias.scope.decl(metadata !5) +; CHECK: store i32 0, i32* %arrayidx, align 16, !noalias !5 +; CHECK: store i32 43, i32* %_pA, align 16, !alias.scope !5 +; CHECK: tail call void @llvm.experimental.noalias.scope.decl(metadata !11) +; CHECK: store i32 42, i32* %_pA, align 16, !alias.scope !11 +; CHECK: for.end: +entry: + %array = alloca [20 x i32], align 16 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + tail call void @llvm.experimental.noalias.scope.decl(metadata !2) + store i32 42, i32* %_pA, align 16, !alias.scope !2 + %cmp = icmp slt i32 %i.0, 100 + %arrayidx = getelementptr inbounds [20 x i32], [20 x i32]* %array, i64 0, i64 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + store i32 0, i32* %arrayidx, align 16, !noalias !2 + store i32 43, i32* %_pA, align 16, !alias.scope !2 + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %arrayidx.lcssa = phi i32* [ %arrayidx, %for.cond ] + call void @g(i32* %arrayidx.lcssa) nounwind + ret void +} + +define void @test_05(i32* nocapture %_pA) nounwind ssp { +; CHECK-LABEL: @test_05( +; CHECK: entry: +; CHECK: tail call void @llvm.experimental.noalias.scope.decl(metadata !13) +; CHECK: store i32 42, i32* %_pA, align 16, !alias.scope !13 +; CHECK: for.body: +; CHECK: tail call void @llvm.experimental.noalias.scope.decl(metadata !5) +; CHECK: store i32 0, i32* %arrayidx, align 16, !noalias !5 +; CHECK: store i32 43, i32* %_pA, align 16, !alias.scope !5 +; CHECK: tail call void @llvm.experimental.noalias.scope.decl(metadata !15) +; CHECK: store i32 42, i32* %_pA, align 16, !alias.scope !15 +; CHECK: for.end: +; CHECK: store i32 44, i32* %_pA, align 16, !alias.scope !5 + +entry: + %array = alloca [20 x i32], align 16 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + tail call void @llvm.experimental.noalias.scope.decl(metadata !2) + store i32 42, i32* %_pA, align 16, !alias.scope !2 + %cmp = icmp slt i32 %i.0, 100 + %arrayidx = getelementptr inbounds [20 x i32], [20 x i32]* %array, i64 0, i64 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + store i32 0, i32* %arrayidx, align 16, !noalias !2 + store i32 43, i32* %_pA, align 16, !alias.scope !2 + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %arrayidx.lcssa = phi i32* [ %arrayidx, %for.cond ] + store i32 44, i32* %_pA, align 16, !alias.scope !2 + call void @g(i32* %arrayidx.lcssa) nounwind + ret void +} + +; Function Attrs: inaccessiblememonly nounwind +declare void @llvm.experimental.noalias.scope.decl(metadata) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { inaccessiblememonly nounwind } +attributes #2 = { nounwind readnone speculatable } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang"} +!2 = !{!3} +!3 = distinct !{!3, !4, !"test_loop_rotate_XX: pA"} +!4 = distinct !{!4, !"test_loop_rotate_XX"} + +; CHECK: !0 = !{i32 1, !"wchar_size", i32 4} +; CHECK: !1 = !{!"clang"} +; CHECK: !2 = !{!3} +; CHECK: !3 = distinct !{!3, !4, !"test_loop_rotate_XX: pA:pre.rot"} +; CHECK: !4 = distinct !{!4, !"test_loop_rotate_XX"} +; CHECK: !5 = !{!6} +; CHECK: !6 = distinct !{!6, !4, !"test_loop_rotate_XX: pA"} +; CHECK: !7 = !{!8} +; CHECK: !8 = distinct !{!8, !4, !"test_loop_rotate_XX: pA:h.rot"} +; CHECK: !9 = !{!10} +; CHECK: !10 = distinct !{!10, !4, !"test_loop_rotate_XX: pA:pre.rot"} +; CHECK: !11 = !{!12} +; CHECK: !12 = distinct !{!12, !4, !"test_loop_rotate_XX: pA:h.rot"} +; CHECK: !13 = !{!14} +; CHECK: !14 = distinct !{!14, !4, !"test_loop_rotate_XX: pA:pre.rot"} +; CHECK: !15 = !{!16} +; CHECK: !16 = distinct !{!16, !4, !"test_loop_rotate_XX: pA:h.rot"}