Index: include/llvm/Analysis/LoopAccessAnalysis.h =================================================================== --- include/llvm/Analysis/LoopAccessAnalysis.h +++ include/llvm/Analysis/LoopAccessAnalysis.h @@ -522,6 +522,11 @@ /// no memory dependence cycles. bool canVectorizeMemory() const { return CanVecMem; } + /// Return true if it's legal to insert a runtime pointer check. There may + /// still be reported runtime pointer checks that would be required, but it is + /// not legal to insert them. + bool canInsertRuntimeCheck() const { return CanInsertRuntimeCheck; } + const RuntimePointerChecking *getRuntimePointerChecking() const { return PtrRtChecking.get(); } @@ -642,6 +647,7 @@ /// Cache the result of analyzeLoop. bool CanVecMem; + bool CanInsertRuntimeCheck; /// Indicator that there are non vectorizable stores to a uniform address. bool HasDependenceInvolvingLoopInvariantAddress; Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -1778,15 +1778,25 @@ unsigned NumReads = 0; unsigned NumReadWrites = 0; + // A runtime check is only legal to insert if there are no convergent calls. + // At first assume there are. + CanInsertRuntimeCheck = false; + PtrRtChecking->Pointers.clear(); PtrRtChecking->Need = false; + bool HasConvergentOp = false; const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); // For each block. for (BasicBlock *BB : TheLoop->blocks()) { // Scan the BB and collect legal loads and stores. for (Instruction &I : *BB) { + if (auto *Call = dyn_cast(&I)) { + if (Call->isConvergent()) + HasConvergentOp = true; + } + // If this is a load, save it. If this instruction can read from memory // but is not a load, then we quit. Notice that we don't handle function // calls that read or write. @@ -1845,6 +1855,8 @@ } // Next instr. } // Next block. + CanInsertRuntimeCheck = !HasConvergentOp; + // Now we have two lists that hold the loads and the stores. // Next, we find the pointers that they use. @@ -1997,6 +2009,16 @@ } } + if (HasConvergentOp) { + recordAnalysis("CantInsertRuntimeCheckWithConvergent") + << "cannot add control dependency to convergent operation"; + + LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check " + "would be needed with a convergent operation\n"); + CanVecMem = false; + return; + } + if (CanVecMem) LLVM_DEBUG( dbgs() << "LAA: No unsafe dependent memory operations in loop. We" @@ -2285,6 +2307,7 @@ PtrRtChecking(llvm::make_unique(SE)), DepChecker(llvm::make_unique(*PSE, L)), TheLoop(L), NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false), + CanInsertRuntimeCheck(true), HasDependenceInvolvingLoopInvariantAddress(false) { if (canAnalyzeLoop()) analyzeLoop(AA, LI, TLI, DT); Index: lib/Transforms/Scalar/LoopDistribute.cpp =================================================================== --- lib/Transforms/Scalar/LoopDistribute.cpp +++ lib/Transforms/Scalar/LoopDistribute.cpp @@ -690,6 +690,12 @@ if (!Dependences || Dependences->empty()) return fail("NoUnsafeDeps", "no unsafe dependences to isolate"); + if (!LAI->canInsertRuntimeCheck() && + LAI->getNumRuntimePointerChecks() != 0) { + return fail("RuntimeCheckWithConvergent", + "may not insert runtime check with convergent operation"); + } + InstPartitionContainer Partitions(L, LI, DT); // First, go through each memory operation and assign them to consecutive Index: test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll =================================================================== --- /dev/null +++ test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll @@ -0,0 +1,73 @@ +; RUN: opt -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s + +; Analyze this loop: +; for (i = 0; i < n; i++) +; A[i + 1] = A[i] * B[i] * C[i]; + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.10.0" + +; CHECK: Report: cannot add control dependency to convergent operation +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Backward: +; CHECK-NEXT: %loadA = load i16, i16* %arrayidxA, align 2 -> +; CHECK-NEXT: store i16 %mul1, i16* %arrayidxA_plus_2, align 2 +; CHECK: Run-time memory checks: +; CHECK-NEXT: 0: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3 +; CHECK-NEXT: %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add +; CHECK-NEXT: Against group +; CHECK-NEXT: %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %storemerge3 +; CHECK-NEXT: 1: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3 +; CHECK-NEXT: %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add +; CHECK-NEXT: Against group +; CHECK-NEXT: %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %storemerge3 + +@B = common global i16* null, align 8 +@A = common global i16* null, align 8 +@C = common global i16* null, align 8 + +define void @f() { +entry: + %a = load i16*, i16** @A, align 8 + %b = load i16*, i16** @B, align 8 + %c = load i16*, i16** @C, align 8 + br label %for.body + +for.body: ; preds = %for.body, %entry + %storemerge3 = phi i64 [ 0, %entry ], [ %add, %for.body ] + + %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3 + %loadA = load i16, i16* %arrayidxA, align 2 + + %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %storemerge3 + %loadB = load i16, i16* %arrayidxB, align 2 + + %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %storemerge3 + %loadC = load i16, i16* %arrayidxC, align 2 + + call void @llvm.convergent() + + %mul = mul i16 %loadB, %loadA + %mul1 = mul i16 %mul, %loadC + + %add = add nuw nsw i64 %storemerge3, 1 + %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add + store i16 %mul1, i16* %arrayidxA_plus_2, align 2 + + %exitcond = icmp eq i64 %add, 20 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +declare void @llvm.convergent() #0 + +attributes #0 = { nounwind readnone convergent } + + Index: test/Transforms/LoopDistribute/basic-with-memchecks.ll =================================================================== --- test/Transforms/LoopDistribute/basic-with-memchecks.ll +++ test/Transforms/LoopDistribute/basic-with-memchecks.ll @@ -5,6 +5,9 @@ ; RUN: -verify-loop-info -verify-dom-info -S < %s | \ ; RUN: FileCheck --check-prefix=VECTORIZE %s +; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info \ +; RUN: -loop-accesses -analyze < %s | FileCheck %s --check-prefix=ANALYSIS + ; The memcheck version of basic.ll. We should distribute and vectorize the ; second part of this loop with 5 memchecks (A+1 x {C, D, E} + C x {A, B}) ; @@ -108,3 +111,60 @@ for.end: ; preds = %for.body ret void } + +declare i32 @llvm.convergent(i32) #0 + +; This is the same as f, and would require the same bounds +; check. However, it is not OK to introduce new control dependencies +; on the convergent call. + +; CHECK-LAEBL: @f_with_convergent( +; CHECK: call i32 @llvm.convergent +; CHECK-NOT: call i32 @llvm.convergent + +; ANALYSIS: for.body: +; ANALYSIS: Report: cannot add control dependency to convergent operation +define void @f_with_convergent() { +entry: + %a = load i32*, i32** @A, align 8 + %b = load i32*, i32** @B, align 8 + %c = load i32*, i32** @C, align 8 + %d = load i32*, i32** @D, align 8 + %e = load i32*, i32** @E, align 8 + br label %for.body + +for.body: ; preds = %for.body, %entry + %ind = phi i64 [ 0, %entry ], [ %add, %for.body ] + + %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind + %loadA = load i32, i32* %arrayidxA, align 4 + + %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind + %loadB = load i32, i32* %arrayidxB, align 4 + + %mulA = mul i32 %loadB, %loadA + + %add = add nuw nsw i64 %ind, 1 + %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add + store i32 %mulA, i32* %arrayidxA_plus_4, align 4 + + %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind + %loadD = load i32, i32* %arrayidxD, align 4 + + %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind + %loadE = load i32, i32* %arrayidxE, align 4 + + %convergentD = call i32 @llvm.convergent(i32 %loadD) + %mulC = mul i32 %convergentD, %loadE + + %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind + store i32 %mulC, i32* %arrayidxC, align 4 + + %exitcond = icmp eq i64 %add, 20 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +attributes #0 = { nounwind readnone convergent } Index: test/Transforms/LoopDistribute/basic.ll =================================================================== --- test/Transforms/LoopDistribute/basic.ll +++ test/Transforms/LoopDistribute/basic.ll @@ -18,6 +18,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.10.0" +; CHECK-LABEL: @f( define void @f(i32* noalias %a, i32* noalias %b, i32* noalias %c, @@ -81,3 +82,77 @@ for.end: ; preds = %for.body ret void } + +declare i32 @llvm.convergent(i32) #0 + +; It is OK to distribute with a convergent operation, since in each +; new loop the convergent operation has the ssame control dependency. +; CHECK-LABEL: @f_with_convergent( +define void @f_with_convergent(i32* noalias %a, + i32* noalias %b, + i32* noalias %c, + i32* noalias %d, + i32* noalias %e) { +entry: + br label %for.body + +; Verify the two distributed loops. + +; CHECK: entry.split.ldist1: +; CHECK: br label %for.body.ldist1 +; CHECK: for.body.ldist1: +; CHECK: %mulA.ldist1 = mul i32 %loadB.ldist1, %loadA.ldist1 +; CHECK: br i1 %exitcond.ldist1, label %entry.split, label %for.body.ldist1 + +; CHECK: entry.split: +; CHECK: br label %for.body +; CHECK: for.body: +; CHECK: %convergentD = call i32 @llvm.convergent(i32 %loadD) +; CHECK: %mulC = mul i32 %convergentD, %loadE +; CHECK: for.end: + + +; ANALYSIS: for.body: +; ANALYSIS-NEXT: Report: cannot add control dependency to convergent operation +; ANALYSIS: for.body.ldist1: +; ANALYSIS-NEXT: Report: unsafe dependent memory operations in loop + +; convergent instruction happens to block vectorization +; VECTORIZE: call i32 @llvm.convergent +; VECTORIZE: mul i32 + +for.body: ; preds = %for.body, %entry + %ind = phi i64 [ 0, %entry ], [ %add, %for.body ] + + %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind + %loadA = load i32, i32* %arrayidxA, align 4 + + %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind + %loadB = load i32, i32* %arrayidxB, align 4 + + %mulA = mul i32 %loadB, %loadA + + %add = add nuw nsw i64 %ind, 1 + %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add + store i32 %mulA, i32* %arrayidxA_plus_4, align 4 + + %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind + %loadD = load i32, i32* %arrayidxD, align 4 + + %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind + %loadE = load i32, i32* %arrayidxE, align 4 + + %convergentD = call i32 @llvm.convergent(i32 %loadD) + %mulC = mul i32 %convergentD, %loadE + + %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind + store i32 %mulC, i32* %arrayidxC, align 4 + + %exitcond = icmp eq i64 %add, 20 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +attributes #0 = { nounwind readnone convergent } Index: test/Transforms/LoopDistribute/diagnostics.ll =================================================================== --- test/Transforms/LoopDistribute/diagnostics.ll +++ test/Transforms/LoopDistribute/diagnostics.ll @@ -131,6 +131,50 @@ ret void, !dbg !34 } +; MISSED_REMARKS: /tmp/t.c:27:5: loop not distributed: use -Rpass-analysis=loop-distribute for more info +; ANALYSIS_REMARKS: /tmp/t.c:27:5: loop not distributed: may not insert runtime check with convergent operation +; ALWAYS: warning: /tmp/t.c:27:5: loop not distributed: failed explicitly specified loop distribution +define void @convergent(i8* %A, i8* %B, i8* %C, i8* %D, i8* %E, i32 %N) #1 !dbg !45 { +entry: + %cmp28 = icmp sgt i32 %N, 0, !dbg !46 + br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !47 + +ph: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ] + %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !49 + %0 = load i8, i8* %arrayidx, align 1, !dbg !49, !tbaa !13 + %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !50 + %1 = load i8, i8* %arrayidx2, align 1, !dbg !50, !tbaa !13 + %add = add i8 %1, %0, !dbg !51 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !57 + %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !52 + store i8 %add, i8* %arrayidx7, align 1, !dbg !53, !tbaa !13 + %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !54 + %2 = load i8, i8* %arrayidx9, align 1, !dbg !54, !tbaa !13 + %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !55 + %3 = load i8, i8* %arrayidx12, align 1, !dbg !55, !tbaa !13 + %mul = mul i8 %3, %2, !dbg !56 + %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !57 + store i8 %mul, i8* %arrayidx16, align 1, !dbg !58, !tbaa !13 + call void @llvm.convergent() + %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !57 + %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !57 + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !20, !dbg !57 + +for.cond.cleanup: + ret void, !dbg !58 +} + + +declare void @llvm.convergent() #0 + +attributes #0 = { nounwind readnone convergent } +attributes #1 = { nounwind convergent } + + !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4} @@ -177,3 +221,17 @@ !42 = !DILocation(line: 17, column: 17, scope: !31) !43 = !DILocation(line: 17, column: 5, scope: !31) !44 = !DILocation(line: 17, column: 10, scope: !31) +!45 = distinct !DISubprogram(name: "convergent", scope: !1, file: !1, line: 24, type: !8, isLocal: false, isDefinition: true, scopeLine: 24, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) +!46 = !DILocation(line: 25, column: 20, scope: !45) +!47 = !DILocation(line: 25, column: 3, scope: !45) +!48 = !DILocation(line: 29, column: 1, scope: !45) +!49 = !DILocation(line: 26, column: 16, scope: !45) +!50 = !DILocation(line: 26, column: 23, scope: !45) +!51 = !DILocation(line: 26, column: 21, scope: !45) +!52 = !DILocation(line: 26, column: 5, scope: !45) +!53 = !DILocation(line: 26, column: 14, scope: !45) +!54 = !DILocation(line: 27, column: 12, scope: !45) +!55 = !DILocation(line: 27, column: 19, scope: !45) +!56 = !DILocation(line: 27, column: 17, scope: !45) +!57 = !DILocation(line: 27, column: 5, scope: !45) +!58 = !DILocation(line: 27, column: 10, scope: !45)