Index: include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- include/llvm/Analysis/LoopAccessAnalysis.h
+++ include/llvm/Analysis/LoopAccessAnalysis.h
@@ -522,6 +522,11 @@
   /// no memory dependence cycles.
   bool canVectorizeMemory() const { return CanVecMem; }
 
+  /// Return true if there is a convergent operation in the loop. There may
+  /// still be reported runtime pointer checks that would be required, but it
+  /// is not legal to insert them.
+  bool hasConvergentOp() const { return HasConvergentOp; }
+
   const RuntimePointerChecking *getRuntimePointerChecking() const {
     return PtrRtChecking.get();
   }
@@ -642,6 +647,7 @@
   /// Cache the result of analyzeLoop.
   bool CanVecMem;
+  bool HasConvergentOp;
 
   /// Indicator that there are non vectorizable stores to a uniform address.
   bool HasDependenceInvolvingLoopInvariantAddress;
Index: lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- lib/Analysis/LoopAccessAnalysis.cpp
+++ lib/Analysis/LoopAccessAnalysis.cpp
@@ -1778,6 +1778,11 @@
   unsigned NumReads = 0;
   unsigned NumReadWrites = 0;
 
+  bool HasComplexMemInst = false;
+
+  // A runtime check is only legal to insert if there are no convergent calls.
+  HasConvergentOp = false;
+
   PtrRtChecking->Pointers.clear();
   PtrRtChecking->Need = false;
 
@@ -1785,8 +1790,25 @@
 
   // For each block.
   for (BasicBlock *BB : TheLoop->blocks()) {
-    // Scan the BB and collect legal loads and stores.
+    // Scan the BB and collect legal loads and stores. Also detect any
+    // convergent instructions.
    for (Instruction &I : *BB) {
+      if (auto *Call = dyn_cast<CallBase>(&I)) {
+        if (Call->isConvergent())
+          HasConvergentOp = true;
+      }
+
+      // With both a non-vectorizable memory instruction and a convergent
+      // operation found in this loop, there is no reason to continue the
+      // search.
+      if (HasComplexMemInst && HasConvergentOp) {
+        CanVecMem = false;
+        return;
+      }
+
+      // Avoid hitting recordAnalysis multiple times.
+      if (HasComplexMemInst)
+        continue;
+
       // If this is a load, save it. If this instruction can read from memory
       // but is not a load, then we quit. Notice that we don't handle function
       // calls that read or write.
@@ -1805,12 +1827,18 @@
         continue;
 
       auto *Ld = dyn_cast<LoadInst>(&I);
-      if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
+      if (!Ld) {
+        recordAnalysis("CantVectorizeInstruction", Ld)
+            << "instruction cannot be vectorized";
+        HasComplexMemInst = true;
+        continue;
+      }
+      if (!Ld->isSimple() && !IsAnnotatedParallel) {
         recordAnalysis("NonSimpleLoad", Ld)
             << "read with atomic ordering or volatile read";
         LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
-        CanVecMem = false;
-        return;
+        HasComplexMemInst = true;
+        continue;
       }
       NumLoads++;
       Loads.push_back(Ld);
@@ -1826,15 +1854,15 @@
       if (!St) {
         recordAnalysis("CantVectorizeInstruction", St)
             << "instruction cannot be vectorized";
-        CanVecMem = false;
-        return;
+        HasComplexMemInst = true;
+        continue;
       }
       if (!St->isSimple() && !IsAnnotatedParallel) {
         recordAnalysis("NonSimpleStore", St)
             << "write with atomic ordering or volatile write";
         LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
-        CanVecMem = false;
-        return;
+        HasComplexMemInst = true;
+        continue;
       }
       NumStores++;
       Stores.push_back(St);
@@ -1845,6 +1873,11 @@
     } // Next instr.
   } // Next block.
 
+  if (HasComplexMemInst) {
+    CanVecMem = false;
+    return;
+  }
+
   // Now we have two lists that hold the loads and the stores.
   // Next, we find the pointers that they use.
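The flag computed above is exported through hasConvergentOp() so that transforms
can consult it before emitting guards of their own. A minimal sketch of the
intended consumer-side pattern (illustrative only, not part of this patch; the
helper name is invented):

    // Refuse to version a loop under runtime memchecks when its body contains
    // a convergent operation: the guard branch would add a control dependency
    // that convergent calls may not acquire. Note that LAA may still *report*
    // the checks it would have wanted (see the doc comment on
    // hasConvergentOp() above).
    static bool mayVersionLoop(const LoopAccessInfo &LAI) {
      if (LAI.hasConvergentOp() &&
          !LAI.getRuntimePointerChecking()->getChecks().empty())
        return false;
      return true;
    }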
@@ -1894,7 +1927,7 @@
     }
   }
 
-  if (IsAnnotatedParallel) {
+  if (IsAnnotatedParallel/* && !HasConvergentOp*/) {
     LLVM_DEBUG(
         dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
                << "checks.\n");
@@ -1962,7 +1995,7 @@
   }
 
   LLVM_DEBUG(
-    dbgs() << "LAA: We can perform a memory runtime check if needed.\n");
+    dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");
 
   CanVecMem = true;
   if (Accesses.isDependencyCheckNeeded()) {
@@ -1997,6 +2030,15 @@
     }
   }
 
+  if (HasConvergentOp) {
+    recordAnalysis("CantInsertRuntimeCheckWithConvergent")
+        << "cannot add control dependency to convergent operation";
+    LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
+                         "would be needed with a convergent operation\n");
+    CanVecMem = false;
+    return;
+  }
+
   if (CanVecMem)
     LLVM_DEBUG(
         dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
@@ -2285,6 +2327,7 @@
       PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
       DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
       NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
+      HasConvergentOp(false),
       HasDependenceInvolvingLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
     analyzeLoop(AA, LI, TLI, DT);
@@ -2301,6 +2344,9 @@
     OS << "\n";
   }
 
+  if (HasConvergentOp)
+    OS.indent(Depth) << "Has convergent operation in loop\n";
+
   if (Report)
     OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";
 
Index: lib/Transforms/Scalar/LoopDistribute.cpp
===================================================================
--- lib/Transforms/Scalar/LoopDistribute.cpp
+++ lib/Transforms/Scalar/LoopDistribute.cpp
@@ -768,8 +768,14 @@
                   "cannot isolate unsafe dependencies");
     }
 
-    // Don't distribute the loop if we need too many SCEV run-time checks.
+    // Don't distribute the loop if we need too many SCEV run-time checks, or
+    // any at all if it is illegal to insert them.
     const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
+    if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {
+      return fail("RuntimeCheckWithConvergent",
+                  "may not insert runtime check with convergent operation");
+    }
+
     if (Pred.getComplexity() > (IsForced.getValueOr(false)
                                     ? PragmaDistributeSCEVCheckThreshold
                                     : DistributeSCEVCheckThreshold))
@@ -797,7 +803,14 @@
     auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                   RtPtrChecking);
 
+    if (LAI->hasConvergentOp() && !Checks.empty()) {
+      return fail("RuntimeCheckWithConvergent",
+                  "may not insert runtime check with convergent operation");
+    }
+
     if (!Pred.isAlwaysTrue() || !Checks.empty()) {
+      assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning");
+
       MDNode *OrigLoopID = L->getLoopID();
 
       LLVM_DEBUG(dbgs() << "\nPointers:\n");
Index: test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll
===================================================================
--- /dev/null
+++ test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll
@@ -0,0 +1,73 @@
+; RUN: opt -loop-accesses -analyze < %s | FileCheck %s
+; RUN: opt -passes='require<aa>,require<scalar-evolution>,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s
+
+; Analyze this loop:
+;   for (i = 0; i < n; i++)
+;     A[i + 1] = A[i] * B[i] * C[i];
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK: for.body:
+; CHECK: Has convergent operation in loop
+; CHECK: Report: cannot add control dependency to convergent operation
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: %loadA = load i16, i16* %arrayidxA, align 2 ->
+; CHECK-NEXT: store i16 %mul1, i16* %arrayidxA_plus_2, align 2
+; CHECK: Run-time memory checks:
+; CHECK-NEXT: 0:
+; CHECK-NEXT: Comparing group
+; CHECK-NEXT: %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
+; CHECK-NEXT: %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
+; CHECK-NEXT: Against group
+; CHECK-NEXT: %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %storemerge3
+; CHECK-NEXT: 1:
+; CHECK-NEXT: Comparing group
+; CHECK-NEXT: %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
+; CHECK-NEXT: %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
+; CHECK-NEXT: Against group
+; CHECK-NEXT: %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %storemerge3
+
+@B = common global i16* null, align 8
+@A = common global i16* null, align 8
+@C = common global i16* null, align 8
+
+define void @f() #1 {
+entry:
+  %a = load i16*, i16** @A, align 8
+  %b = load i16*, i16** @B, align 8
+  %c = load i16*, i16** @C, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %storemerge3 = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
+  %loadA = load i16, i16* %arrayidxA, align 2
+
+  %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %storemerge3
+  %loadB = load i16, i16* %arrayidxB, align 2
+
+  %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %storemerge3
+  %loadC = load i16, i16* %arrayidxC, align 2
+
+  call void @llvm.convergent()
+
+  %mul = mul i16 %loadB, %loadA
+  %mul1 = mul i16 %mul, %loadC
+
+  %add = add nuw nsw i64 %storemerge3, 1
+  %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
+  store i16 %mul1, i16* %arrayidxA_plus_2, align 2
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare void @llvm.convergent() #0
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }
Index: test/Transforms/LoopDistribute/basic-with-memchecks.ll
===================================================================
--- test/Transforms/LoopDistribute/basic-with-memchecks.ll
+++ test/Transforms/LoopDistribute/basic-with-memchecks.ll
@@ -5,6 +5,9 @@
 ; RUN:   -verify-loop-info -verify-dom-info -S < %s | \
 ; RUN:   FileCheck --check-prefix=VECTORIZE %s
 
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info \
+; RUN:   -loop-accesses -analyze < %s | FileCheck %s --check-prefix=ANALYSIS
+
 ; The memcheck version of basic.ll. We should distribute and vectorize the
 ; second part of this loop with 5 memchecks (A+1 x {C, D, E} + C x {A, B})
 ;
@@ -169,3 +172,113 @@
 for.end:                                          ; preds = %for.body
   ret void
 }
+
+declare i32 @llvm.convergent(i32) #0
+
+; This is the same as f, and would require the same bounds
+; check. However, it is not OK to introduce new control dependencies
+; on the convergent call.
+
+; CHECK-LABEL: @f_with_convergent(
+; CHECK: call i32 @llvm.convergent
+; CHECK-NOT: call i32 @llvm.convergent
+
+; ANALYSIS: for.body:
+; ANALYSIS: Report: cannot add control dependency to convergent operation
+define void @f_with_convergent() #1 {
+entry:
+  %a = load i32*, i32** @A, align 8
+  %b = load i32*, i32** @B, align 8
+  %c = load i32*, i32** @C, align 8
+  %d = load i32*, i32** @D, align 8
+  %e = load i32*, i32** @E, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %convergentD = call i32 @llvm.convergent(i32 %loadD)
+  %mulC = mul i32 %convergentD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Make sure an explicit request for distribution is ignored if it
+; requires possibly illegal checks.
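+;
+; (The explicit request comes from the !llvm.loop metadata on this
+; function's backedge: !0/!1 at the bottom of this file set
+; llvm.loop.distribute.enable, and the convergent bailout must still win.)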
+
+; CHECK-LABEL: @f_with_convergent_forced_distribute(
+; CHECK: call i32 @llvm.convergent
+; CHECK-NOT: call i32 @llvm.convergent
+define void @f_with_convergent_forced_distribute() #1 {
+entry:
+  %a = load i32*, i32** @A, align 8
+  %b = load i32*, i32** @B, align 8
+  %c = load i32*, i32** @C, align 8
+  %d = load i32*, i32** @D, align 8
+  %e = load i32*, i32** @E, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %convergentD = call i32 @llvm.convergent(i32 %loadD)
+  %mulC = mul i32 %convergentD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.distribute.enable", i1 true}
Index: test/Transforms/LoopDistribute/basic.ll
===================================================================
--- test/Transforms/LoopDistribute/basic.ll
+++ test/Transforms/LoopDistribute/basic.ll
@@ -18,6 +18,7 @@
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.10.0"
 
+; CHECK-LABEL: @f(
 define void @f(i32* noalias %a,
                i32* noalias %b,
                i32* noalias %c,
@@ -81,3 +82,78 @@
 for.end:                                          ; preds = %for.body
   ret void
 }
+
+declare i32 @llvm.convergent(i32) #0
+
+; It is OK to distribute with a convergent operation, since in each
+; new loop the convergent operation has the same control dependency.
+; CHECK-LABEL: @f_with_convergent(
+define void @f_with_convergent(i32* noalias %a,
+                               i32* noalias %b,
+                               i32* noalias %c,
+                               i32* noalias %d,
+                               i32* noalias %e) {
+entry:
+  br label %for.body
+
+; Verify the two distributed loops.
+
+; CHECK: entry.split.ldist1:
+; CHECK: br label %for.body.ldist1
+; CHECK: for.body.ldist1:
+; CHECK: %mulA.ldist1 = mul i32 %loadB.ldist1, %loadA.ldist1
+; CHECK: br i1 %exitcond.ldist1, label %entry.split, label %for.body.ldist1
+
+; CHECK: entry.split:
+; CHECK: br label %for.body
+; CHECK: for.body:
+; CHECK: %convergentD = call i32 @llvm.convergent(i32 %loadD)
+; CHECK: %mulC = mul i32 %convergentD, %loadE
+; CHECK: for.end:
+
+; ANALYSIS: for.body:
+; ANALYSIS-NEXT: Has convergent operation in loop
+; ANALYSIS-NEXT: Report: cannot add control dependency to convergent operation
+; ANALYSIS: for.body.ldist1:
+; ANALYSIS-NEXT: Report: unsafe dependent memory operations in loop
+
+; The convergent instruction happens to block vectorization.
+; VECTORIZE: call i32 @llvm.convergent
+; VECTORIZE: mul i32
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %convergentD = call i32 @llvm.convergent(i32 %loadD)
+  %mulC = mul i32 %convergentD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
Index: test/Transforms/LoopDistribute/convergent-no-cross-partition-checks.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopDistribute/convergent-no-cross-partition-checks.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute \
+; RUN:   -verify-loop-info -verify-dom-info -S < %s | FileCheck %s
+
+; Derived from crash-in-memcheck-generation.ll
+
+; Make sure the loop is distributed even with a convergent
+; op. LoopAccessAnalysis says that runtime checks are necessary, but
+; none are cross partition, so none are truly needed.
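+;
+; (With %c, %d and %e marked noalias, every memcheck LAA reports is between
+; pointers that end up in the same partition, so LoopDistribute's
+; cross-partition filter leaves an empty check set.)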
+
+define void @f(i32* %a, i32* %b, i32* noalias %c, i32* noalias %d, i32* noalias %e) #1 {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_LDIST1:%.*]]
+; CHECK:       entry.split.ldist1:
+; CHECK-NEXT:    br label [[FOR_BODY_LDIST1:%.*]]
+; CHECK:       for.body.ldist1:
+; CHECK-NEXT:    [[IND_LDIST1:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT_LDIST1]] ], [ [[ADD_LDIST1:%.*]], [[FOR_BODY_LDIST1]] ]
+; CHECK-NEXT:    [[ARRAYIDXA_LDIST1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IND_LDIST1]]
+; CHECK-NEXT:    [[LOADA_LDIST1:%.*]] = load i32, i32* [[ARRAYIDXA_LDIST1]], align 4
+; CHECK-NEXT:    [[ARRAYIDXB_LDIST1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[IND_LDIST1]]
+; CHECK-NEXT:    [[LOADB_LDIST1:%.*]] = load i32, i32* [[ARRAYIDXB_LDIST1]], align 4
+; CHECK-NEXT:    [[MULA_LDIST1:%.*]] = mul i32 [[LOADB_LDIST1]], [[LOADA_LDIST1]]
+; CHECK-NEXT:    [[ADD_LDIST1]] = add nuw nsw i64 [[IND_LDIST1]], 1
+; CHECK-NEXT:    [[ARRAYIDXA_PLUS_4_LDIST1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[ADD_LDIST1]]
+; CHECK-NEXT:    store i32 [[MULA_LDIST1]], i32* [[ARRAYIDXA_PLUS_4_LDIST1]], align 4
+; CHECK-NEXT:    [[EXITCOND_LDIST1:%.*]] = icmp eq i64 [[ADD_LDIST1]], 20
+; CHECK-NEXT:    br i1 [[EXITCOND_LDIST1]], label [[ENTRY_SPLIT:%.*]], label [[FOR_BODY_LDIST1]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IND:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ADD]] = add nuw nsw i64 [[IND]], 1
+; CHECK-NEXT:    [[ARRAYIDXD:%.*]] = getelementptr inbounds i32, i32* [[D:%.*]], i64 [[IND]]
+; CHECK-NEXT:    [[LOADD:%.*]] = load i32, i32* [[ARRAYIDXD]], align 4
+; CHECK-NEXT:    [[CONVERGENTD:%.*]] = call i32 @llvm.convergent(i32 [[LOADD]])
+; CHECK-NEXT:    [[ARRAYIDXE:%.*]] = getelementptr inbounds i32, i32* [[E:%.*]], i64 [[IND]]
+; CHECK-NEXT:    [[LOADE:%.*]] = load i32, i32* [[ARRAYIDXE]], align 4
+; CHECK-NEXT:    [[MULC:%.*]] = mul i32 [[CONVERGENTD]], [[LOADE]]
+; CHECK-NEXT:    [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[IND]]
+; CHECK-NEXT:    store i32 [[MULC]], i32* [[ARRAYIDXC]], align 4
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], 20
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+  %convergentD = call i32 @llvm.convergent(i32 %loadD)
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %mulC = mul i32 %convergentD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare i32 @llvm.convergent(i32) #0
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }
Index: test/Transforms/LoopDistribute/diagnostics.ll
===================================================================
--- test/Transforms/LoopDistribute/diagnostics.ll
+++ test/Transforms/LoopDistribute/diagnostics.ll
@@ -131,6 +131,50 @@
   ret void, !dbg !34
 }
 
+; MISSED_REMARKS: /tmp/t.c:27:5: loop not distributed: use -Rpass-analysis=loop-distribute for more info
+; ANALYSIS_REMARKS: /tmp/t.c:27:5: loop not distributed: may not insert runtime check with convergent operation
+; ALWAYS: warning: /tmp/t.c:27:5: loop not distributed: failed explicitly specified loop distribution
+define void @convergent(i8* %A, i8* %B, i8* %C, i8* %D, i8* %E, i32 %N) #1 !dbg !45 {
+entry:
+  %cmp28 = icmp sgt i32 %N, 0, !dbg !46
+  br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !47
+
+ph:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !49
+  %0 = load i8, i8* %arrayidx, align 1, !dbg !49, !tbaa !13
+  %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !50
+  %1 = load i8, i8* %arrayidx2, align 1, !dbg !50, !tbaa !13
+  %add = add i8 %1, %0, !dbg !51
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !57
+  %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !52
+  store i8 %add, i8* %arrayidx7, align 1, !dbg !53, !tbaa !13
+  %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !54
+  %2 = load i8, i8* %arrayidx9, align 1, !dbg !54, !tbaa !13
+  %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !55
+  %3 = load i8, i8* %arrayidx12, align 1, !dbg !55, !tbaa !13
+  %mul = mul i8 %3, %2, !dbg !56
+  %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !57
+  store i8 %mul, i8* %arrayidx16, align 1, !dbg !58, !tbaa !13
+  call void @llvm.convergent()
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !57
+  %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !57
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !20, !dbg !57
+
+for.cond.cleanup:
+  ret void, !dbg !58
+}
+
+declare void @llvm.convergent() #0
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }
+
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
 
@@ -177,3 +221,17 @@
 !42 = !DILocation(line: 17, column: 17, scope: !31)
 !43 = !DILocation(line: 17, column: 5, scope: !31)
 !44 = !DILocation(line: 17, column: 10, scope: !31)
+!45 = distinct !DISubprogram(name: "convergent", scope: !1, file: !1, line: 24, type: !8, isLocal: false, isDefinition: true, scopeLine: 24, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!46 = !DILocation(line: 25, column: 20, scope: !45)
+!47 = !DILocation(line: 25, column: 3, scope: !45)
+!48 = !DILocation(line: 29, column: 1, scope: !45)
+!49 = !DILocation(line: 26, column: 16, scope: !45)
+!50 = !DILocation(line: 26, column: 23, scope: !45)
+!51 = !DILocation(line: 26, column: 21, scope: !45)
+!52 = !DILocation(line: 26, column: 5, scope: !45)
+!53 = !DILocation(line: 26, column: 14, scope: !45)
+!54 = !DILocation(line: 27, column: 12, scope: !45)
+!55 = !DILocation(line: 27, column: 19, scope: !45)
+!56 = !DILocation(line: 27, column: 17, scope: !45)
+!57 = !DILocation(line: 27, column: 5, scope: !45)
+!58 = !DILocation(line: 27, column: 10, scope: !45)
Index: test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll
===================================================================
--- test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll
+++ test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll
@@ -7,7 +7,6 @@
 ; not based on memory access.
 define void @f(i32* noalias %a, i32* noalias %b, i32* noalias %c, i32* noalias %d, i32* noalias %e, i64 %N) {
-
 ; CHECK-LABEL: @f(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A2:%.*]] = ptrtoint i32* [[A:%.*]] to i64
@@ -101,6 +100,7 @@
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
@@ -143,3 +143,84 @@
 for.end:                                          ; preds = %for.body
   ret void
 }
+
+; Can't add control dependency with convergent in loop body.
+define void @f_with_convergent(i32* noalias %a, i32* noalias %b, i32* noalias %c, i32* noalias %d, i32* noalias %e, i64 %N) #1 {
+; CHECK-LABEL: @f_with_convergent(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IND:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[IND1:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[IND1]], 2
+; CHECK-NEXT:    [[MUL_EXT:%.*]] = zext i32 [[MUL]] to i64
+; CHECK-NEXT:    [[ARRAYIDXA:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT:    [[LOADA:%.*]] = load i32, i32* [[ARRAYIDXA]], align 4
+; CHECK-NEXT:    [[ARRAYIDXB:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT:    [[LOADB:%.*]] = load i32, i32* [[ARRAYIDXB]], align 4
+; CHECK-NEXT:    [[MULA:%.*]] = mul i32 [[LOADB]], [[LOADA]]
+; CHECK-NEXT:    [[ADD]] = add nuw nsw i64 [[IND]], 1
+; CHECK-NEXT:    [[INC1]] = add i32 [[IND1]], 1
+; CHECK-NEXT:    [[ARRAYIDXA_PLUS_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[ADD]]
+; CHECK-NEXT:    store i32 [[MULA]], i32* [[ARRAYIDXA_PLUS_4]], align 4
+; CHECK-NEXT:    [[ARRAYIDXD:%.*]] = getelementptr inbounds i32, i32* [[D:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT:    [[LOADD:%.*]] = load i32, i32* [[ARRAYIDXD]], align 4
+; CHECK-NEXT:    [[ARRAYIDXE:%.*]] = getelementptr inbounds i32, i32* [[E:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT:    [[LOADE:%.*]] = load i32, i32* [[ARRAYIDXE]], align 4
+; CHECK-NEXT:    [[CONVERGENTD:%.*]] = call i32 @llvm.convergent(i32 [[LOADD]])
+; CHECK-NEXT:    [[MULC:%.*]] = mul i32 [[CONVERGENTD]], [[LOADE]]
+; CHECK-NEXT:    [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT:    store i32 [[MULC]], i32* [[ARRAYIDXC]], align 4
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  %ind1 = phi i32 [ 0, %entry ], [ %inc1, %for.body ]
+
+  %mul = mul i32 %ind1, 2
+  %mul_ext = zext i32 %mul to i64
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %mul_ext
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %mul_ext
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %inc1 = add i32 %ind1, 1
+
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %mul_ext
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %mul_ext
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %convergentD = call i32 @llvm.convergent(i32 %loadD)
+  %mulC = mul i32 %convergentD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %mul_ext
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare i32 @llvm.convergent(i32) #0
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }
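For reference, the print-access-info output these tests key on has the
following shape (assembled from the CHECK lines of
unsafe-and-rt-checks-convergent.ll above; indentation approximate). The
"Has convergent operation in loop" line is printed whenever the flag is set,
the Report line records that the convergent operation decided the outcome,
and the runtime checks are still listed even though inserting them would be
illegal:

    for.body:
      Has convergent operation in loop
      Report: cannot add control dependency to convergent operation
      Dependences:
        Backward:
            %loadA = load i16, i16* %arrayidxA, align 2 ->
            store i16 %mul1, i16* %arrayidxA_plus_2, align 2
      Run-time memory checks:
      ...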