Index: include/llvm/Analysis/LoopAccessAnalysis.h =================================================================== --- include/llvm/Analysis/LoopAccessAnalysis.h +++ include/llvm/Analysis/LoopAccessAnalysis.h @@ -564,10 +564,10 @@ /// Print the information about the memory accesses in the loop. void print(raw_ostream &OS, unsigned Depth = 0) const; - /// If the loop has multiple stores to an invariant address, then - /// return true, else return false. - bool hasMultipleStoresToLoopInvariantAddress() const { - return HasMultipleStoresToLoopInvariantAddress; + /// If the loop has multiple stores to an invariant address or an unsafe + /// dependency with a uniform load, then return true, else return false. + bool hasNonVectorizableStoresToLoopInvariantAddress() const { + return HasNonVectorizableStoresToLoopInvariantAddress; } /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts @@ -620,8 +620,8 @@ /// Cache the result of analyzeLoop. bool CanVecMem; - /// Indicator that there are multiple stores to a uniform address. - bool HasMultipleStoresToLoopInvariantAddress; + /// Indicator that there are non vectorizable stores to a uniform address. + bool HasNonVectorizableStoresToLoopInvariantAddress; /// The diagnostics report generated for the analysis. E.g. why we /// couldn't analyze the loop. Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -1870,7 +1870,7 @@ Value *Ptr = ST->getPointerOperand(); if (isUniform(Ptr)) - HasMultipleStoresToLoopInvariantAddress |= + HasNonVectorizableStoresToLoopInvariantAddress |= !UniformStores.insert(Ptr).second; // If we did *not* see this pointer before, insert it to the read-write @@ -1914,6 +1914,14 @@ IsReadOnlyPtr = true; } + // See if there is an unsafe dependency between a load to a uniform address and + // store to the same uniform address. 
+ if (UniformStores.find(Ptr) != UniformStores.end()) { + LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform " + "load and uniform store to the same address!\n"); + HasNonVectorizableStoresToLoopInvariantAddress = true; + } + MemoryLocation Loc = MemoryLocation::get(LD); // The TBAA metadata could have a control dependency on the predication // condition, so we cannot rely on it when determining whether or not we @@ -2272,7 +2280,7 @@ PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)), DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L), NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false), - HasMultipleStoresToLoopInvariantAddress(false) { + HasNonVectorizableStoresToLoopInvariantAddress(false) { if (canAnalyzeLoop()) analyzeLoop(AA, LI, TLI, DT); } @@ -2304,8 +2312,8 @@ PtrRtChecking->print(OS, Depth); OS << "\n"; - OS.indent(Depth) << "Multiple stores to invariant address were " - << (HasMultipleStoresToLoopInvariantAddress ? "" : "not ") + OS.indent(Depth) << "Non vectorizable stores to invariant address were " + << (HasNonVectorizableStoresToLoopInvariantAddress ? 
"" : "not ") << "found in loop.\n"; OS.indent(Depth) << "SCEV assumptions:\n"; Index: lib/Transforms/Vectorize/LoopVectorizationLegality.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -814,18 +814,18 @@ "loop not vectorized: ", *LAR); }); } - if (!LAI->canVectorizeMemory()) + if (!LAI->canVectorizeMemory()) { return false; + } - if (LAI->hasMultipleStoresToLoopInvariantAddress()) { + if (LAI->hasNonVectorizableStoresToLoopInvariantAddress()) { ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress") - << "multiple writes to a loop invariant address could not " + << "write to a loop invariant address could not " "be vectorized"); LLVM_DEBUG( - dbgs() << "LV: We don't allow multiple stores to a uniform address\n"); + dbgs() << "LV: Non vectorizable stores to a uniform address\n"); return false; } - Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); PSE.addPredicate(LAI->getPSE().getUnionPredicate()); Index: test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll =================================================================== --- test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll +++ test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll @@ -39,7 +39,7 @@ ; CHECK-NEXT: Group ; CHECK-NEXT: (Low: %b High: ((4 * (1 umax %x)) + %b)) ; CHECK-NEXT: Member: {%b,+,4}<%for.body> -; CHECK: Multiple stores to invariant address were not found in loop. +; CHECK: Non vectorizable stores to invariant address were not found in loop. 
; CHECK-NEXT: SCEV assumptions: ; CHECK-NEXT: {1,+,1}<%for.body> Added Flags: <nusw> ; CHECK-NEXT: {0,+,1}<%for.body> Added Flags: <nusw> Index: test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll =================================================================== --- test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll +++ test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll @@ -14,14 +14,14 @@ ; The LAA with the new PM is a loop pass so we go from inner to outer loops. ; OLDPM: for.cond1.preheader: -; OLDPM: Multiple stores to invariant address were not found in loop. +; OLDPM: Non vectorizable stores to invariant address were not found in loop. ; OLDPM: for.body3: -; OLDPM: Multiple stores to invariant address were found in loop. +; OLDPM: Non vectorizable stores to invariant address were found in loop. ; NEWPM: for.body3: -; NEWPM: Multiple stores to invariant address were found in loop. +; NEWPM: Non vectorizable stores to invariant address were found in loop. ; NEWPM: for.cond1.preheader: -; NEWPM: Multiple stores to invariant address were not found in loop. +; NEWPM: Non vectorizable stores to invariant address were not found in loop. define i32 @foo(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 { entry: Index: test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll =================================================================== --- test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll +++ test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll @@ -10,8 +10,8 @@ ; } ; } -; CHECK: Multiple stores to invariant address were not found in loop. -; CHECK-NOT: Multiple stores to invariant address were found in loop. +; CHECK: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NOT: Non vectorizable stores to invariant address were found in loop. 
define i32 @foo(i32* nocapture readonly %var1, i32* nocapture %var2, i32 %itr) #0 { Index: test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll =================================================================== --- test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll +++ test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll @@ -10,7 +10,7 @@ ; } ; } -; CHECK: Multiple stores to invariant address were not found in loop. +; CHECK: Non vectorizable stores to invariant address were not found in loop. define void @foo(i32* nocapture %var1, i32* nocapture %var2, i32 %itr) #0 { entry: Index: test/Transforms/LoopVectorize/invariant-store-vectorization.ll =================================================================== --- test/Transforms/LoopVectorize/invariant-store-vectorization.ll +++ test/Transforms/LoopVectorize/invariant-store-vectorization.ll @@ -549,3 +549,43 @@ for.end10: ; preds = %for.inc8, %entry ret i32 undef } + +; cannot vectorize a loop with unsafe dependency between uniform load and store +; to the same address +; PR39653 +define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, i16* %arg3, i32 %arg4, i64 %arg5) { +; CHECK-LABEL: unsafe_dep_uniform_load_store +; CHECK-NOT: <4 x i32> +bb: + %tmp = alloca i32 + store i32 %arg4, i32* %tmp + %tmp6 = getelementptr inbounds i16, i16* %arg3, i64 %arg5 + br label %bb7 + +bb7: + %tmp8 = phi i64 [ 0, %bb ], [ %tmp24, %bb7 ] + %tmp9 = phi i32 [ %arg1, %bb ], [ %tmp23, %bb7 ] + %tmp10 = load i32, i32* %tmp + %tmp11 = mul nsw i32 %tmp9, %tmp10 + %tmp12 = srem i32 %tmp11, 65536 + %tmp13 = add nsw i32 %tmp12, %tmp9 + %tmp14 = trunc i32 %tmp13 to i16 + %tmp15 = trunc i64 %tmp8 to i32 + %tmp16 = add i32 %arg, %tmp15 + %tmp17 = zext i32 %tmp16 to i64 + %tmp18 = getelementptr inbounds i16, i16* %tmp6, i64 %tmp17 + store i16 %tmp14, i16* %tmp18, align 2 + %tmp19 = add i32 %tmp13, %tmp9 + %tmp20 = trunc i32 %tmp19 to i16 + %tmp21 = and i16 %tmp20, 255 + %tmp22 = getelementptr 
inbounds i16, i16* %arg3, i64 %tmp17 + store i16 %tmp21, i16* %tmp22, align 2 + %tmp23 = add nsw i32 %tmp9, 1 + %tmp24 = add nuw nsw i64 %tmp8, 1 + %tmp25 = icmp eq i64 %tmp24, %arg2 + store i32 %tmp12, i32* %tmp + br i1 %tmp25, label %bb26, label %bb7 + +bb26: + ret void +}