Index: include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- include/llvm/Analysis/LoopAccessAnalysis.h
+++ include/llvm/Analysis/LoopAccessAnalysis.h
@@ -564,10 +564,10 @@
   /// Print the information about the memory accesses in the loop.
   void print(raw_ostream &OS, unsigned Depth = 0) const;
 
-  /// If the loop has multiple stores to an invariant address, then
+  /// If the loop has non-vectorizable stores to an invariant address, then
   /// return true, else return false.
-  bool hasMultipleStoresToLoopInvariantAddress() const {
-    return HasMultipleStoresToLoopInvariantAddress;
+  bool hasNonVectorizableStoresToLoopInvariantAddress() const {
+    return HasNonVectorizableStoresToLoopInvariantAddress;
   }
 
   /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts
@@ -620,8 +620,8 @@
   /// Cache the result of analyzeLoop.
   bool CanVecMem;
 
-  /// Indicator that there are multiple stores to a uniform address.
-  bool HasMultipleStoresToLoopInvariantAddress;
+  /// Indicator that there are non-vectorizable stores to a uniform address.
+  bool HasNonVectorizableStoresToLoopInvariantAddress;
 
   /// The diagnostics report generated for the analysis. E.g. why we
   /// couldn't analyze the loop.
Index: lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- lib/Analysis/LoopAccessAnalysis.cpp
+++ lib/Analysis/LoopAccessAnalysis.cpp
@@ -1865,13 +1865,19 @@
   // Record uniform store addresses to identify if we have multiple stores
   // to the same address.
   ValueSet UniformStores;
+  // Record each uniform store along with its uniform address. Since only one
+  // store to a given uniform address is vectorizable, it does not matter
+  // which store the map keeps if we see several while populating it.
+  DenseMap<Value *, StoreInst *> UniformStoreMap;
 
   for (StoreInst *ST : Stores) {
     Value *Ptr = ST->getPointerOperand();
-    if (isUniform(Ptr))
-      HasMultipleStoresToLoopInvariantAddress |=
+    if (isUniform(Ptr)) {
+      HasNonVectorizableStoresToLoopInvariantAddress |=
           !UniformStores.insert(Ptr).second;
+      UniformStoreMap[Ptr] = ST;
+    }
 
     // If we did *not* see this pointer before, insert it to the read-write
     // list. At this phase it is only a 'write' list.
@@ -1914,6 +1920,16 @@
       IsReadOnlyPtr = true;
     }
 
+    // See if there is an unsafe dependency between a load from a uniform
+    // address and a store to the same uniform address.
+    if (!HasNonVectorizableStoresToLoopInvariantAddress &&
+        UniformStores.find(Ptr) != UniformStores.end() &&
+        !DT->dominates(UniformStoreMap[Ptr], LD)) {
+      LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
+                           "load and uniform store to the same address!\n");
+      HasNonVectorizableStoresToLoopInvariantAddress = true;
+    }
+
     MemoryLocation Loc = MemoryLocation::get(LD);
 
     // The TBAA metadata could have a control dependency on the predication
     // condition, so we cannot rely on it when determining whether or not we
@@ -2272,7 +2288,7 @@
       PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
       DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
       NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
-      HasMultipleStoresToLoopInvariantAddress(false) {
+      HasNonVectorizableStoresToLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
     analyzeLoop(AA, LI, TLI, DT);
 }
@@ -2304,8 +2320,8 @@
   PtrRtChecking->print(OS, Depth);
   OS << "\n";
 
-  OS.indent(Depth) << "Multiple stores to invariant address were "
-                   << (HasMultipleStoresToLoopInvariantAddress ? "" : "not ")
+  OS.indent(Depth) << "Non-vectorizable stores to invariant address were "
+                   << (HasNonVectorizableStoresToLoopInvariantAddress ? "" : "not ")
                    << "found in loop.\n";
 
   OS.indent(Depth) << "SCEV assumptions:\n";
Index: lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -814,18 +814,18 @@
                                    "loop not vectorized: ", *LAR);
     });
   }
 
-  if (!LAI->canVectorizeMemory())
+  if (!LAI->canVectorizeMemory()) {
     return false;
+  }
 
-  if (LAI->hasMultipleStoresToLoopInvariantAddress()) {
+  if (LAI->hasNonVectorizableStoresToLoopInvariantAddress()) {
     ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
-              << "multiple writes to a loop invariant address could not "
+              << "write to a loop invariant address could not "
                  "be vectorized");
     LLVM_DEBUG(
-        dbgs() << "LV: We don't allow multiple stores to a uniform address\n");
+        dbgs() << "LV: Non-vectorizable stores to a uniform address\n");
     return false;
   }
-
   Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
   PSE.addPredicate(LAI->getPSE().getUnionPredicate());
Index: test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
===================================================================
--- test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
+++ test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
@@ -39,7 +39,7 @@
 ; CHECK-NEXT:     Group
 ; CHECK-NEXT:       (Low: %b High: ((4 * (1 umax %x)) + %b))
 ; CHECK-NEXT:         Member: {%b,+,4}<%for.body>
-; CHECK: Multiple stores to invariant address were not found in loop.
+; CHECK: Non-vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:  SCEV assumptions:
 ; CHECK-NEXT:  {1,+,1}<%for.body> Added Flags: <nusw>
 ; CHECK-NEXT:  {0,+,1}<%for.body> Added Flags: <nusw>
Index: test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
===================================================================
--- test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
+++ test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
@@ -14,14 +14,14 @@
 ; The LAA with the new PM is a loop pass so we go from inner to outer loops.
 
 ; OLDPM: for.cond1.preheader:
-; OLDPM:   Multiple stores to invariant address were not found in loop.
+; OLDPM:   Non-vectorizable stores to invariant address were not found in loop.
 ; OLDPM: for.body3:
-; OLDPM:   Multiple stores to invariant address were found in loop.
+; OLDPM:   Non-vectorizable stores to invariant address were found in loop.
 
 ; NEWPM: for.body3:
-; NEWPM:   Multiple stores to invariant address were found in loop.
+; NEWPM:   Non-vectorizable stores to invariant address were found in loop.
 ; NEWPM: for.cond1.preheader:
-; NEWPM:   Multiple stores to invariant address were not found in loop.
+; NEWPM:   Non-vectorizable stores to invariant address were not found in loop.
 
 define i32 @foo(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
 entry:
Index: test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
===================================================================
--- test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
+++ test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
@@ -10,8 +10,8 @@
 ;   }
 ; }
 
-; CHECK: Multiple stores to invariant address were not found in loop.
-; CHECK-NOT: Multiple stores to invariant address were found in loop.
+; CHECK: Non-vectorizable stores to invariant address were not found in loop.
+; CHECK-NOT: Non-vectorizable stores to invariant address were found in loop.
 
 define i32 @foo(i32* nocapture readonly %var1, i32* nocapture %var2, i32 %itr) #0 {
Index: test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
===================================================================
--- test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
+++ test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
@@ -10,7 +10,7 @@
 ;   }
 ; }
 
-; CHECK: Multiple stores to invariant address were not found in loop.
+; CHECK: Non-vectorizable stores to invariant address were not found in loop.
 
 define void @foo(i32* nocapture %var1, i32* nocapture %var2, i32 %itr) #0 {
 entry:
Index: test/Transforms/LoopVectorize/invariant-store-vectorization.ll
===================================================================
--- test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -549,3 +549,117 @@
 for.end10:                                        ; preds = %for.inc8, %entry
   ret i32 undef
 }
+
+; Cannot vectorize a loop with an unsafe dependency between a uniform load
+; and a store to the same uniform address.
+define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, i16* %arg3, i32 %arg4, i64 %arg5) {
+; CHECK-LABEL: unsafe_dep_uniform_load_store
+; CHECK-NOT: <4 x i32>
+bb:
+  %tmp = alloca i32
+  store i32 %arg4, i32* %tmp
+  %tmp6 = getelementptr inbounds i16, i16* %arg3, i64 %arg5
+  br label %bb7
+
+bb7:
+  %tmp8 = phi i64 [ 0, %bb ], [ %tmp24, %bb7 ]
+  %tmp9 = phi i32 [ %arg1, %bb ], [ %tmp23, %bb7 ]
+  %tmp10 = load i32, i32* %tmp
+  %tmp11 = mul nsw i32 %tmp9, %tmp10
+  %tmp12 = srem i32 %tmp11, 65536
+  %tmp13 = add nsw i32 %tmp12, %tmp9
+  %tmp14 = trunc i32 %tmp13 to i16
+  %tmp15 = trunc i64 %tmp8 to i32
+  %tmp16 = add i32 %arg, %tmp15
+  %tmp17 = zext i32 %tmp16 to i64
+  %tmp18 = getelementptr inbounds i16, i16* %tmp6, i64 %tmp17
+  store i16 %tmp14, i16* %tmp18, align 2
+  %tmp19 = add i32 %tmp13, %tmp9
+  %tmp20 = trunc i32 %tmp19 to i16
+  %tmp21 = and i16 %tmp20, 255
+  %tmp22 = getelementptr inbounds i16, i16* %arg3, i64 %tmp17
+  store i16 %tmp21, i16* %tmp22, align 2
+  %tmp23 = add nsw i32 %tmp9, 1
+  %tmp24 = add nuw nsw i64 %tmp8, 1
+  %tmp25 = icmp eq i64 %tmp24, %arg2
+  store i32 %tmp12, i32* %tmp
+  br i1 %tmp25, label %bb26, label %bb7
+
+bb26:
+  ret void
+}
+
+; A uniform load and store to the same address, but the dependency is safe and the loop can be vectorized.
+define void @safe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, i16* %arg3, i32 %arg4, i64 %arg5) {
+; CHECK-LABEL: safe_dep_uniform_load_store
+; CHECK: vector.scevcheck:
+; CHECK: [[TMP5:%.*]] = or i1
+; CHECK: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK: vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 %arg2, -4
+; CHECK-NEXT:    [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[CAST_CRD]], [[ARG1:%.*]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[ARG1]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND9:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND11:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw <4 x i32> [[VEC_IND9]], [[VEC_IND11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = srem <4 x i32> [[TMP11]], <i32 65536, i32 65536, i32 65536, i32 65536>
+; CHECK-NEXT:    [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[VEC_IND9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc <4 x i32> [[TMP13]] to <4 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], %arg
+; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i16, i16* %tmp6, i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i16* [[TMP18]] to <4 x i16>*
+; CHECK-NEXT:    store <4 x i16> [[TMP14]], <4 x i16>* [[TMP19]], align 2, !alias.scope !43, !noalias !46
+; CHECK-NEXT:    [[TMP20:%.*]] = add <4 x i32> [[TMP13]], [[VEC_IND9]]
+; CHECK-NEXT:    [[TMP21:%.*]] = trunc <4 x i32> [[TMP20]] to <4 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = and <4 x i16> [[TMP21]], <i16 255, i16 255, i16 255, i16 255>
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i16, i16* %arg3, i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i16* [[TMP23]] to <4 x i16>*
+; CHECK-NEXT:    store <4 x i16> [[TMP22]], <4 x i16>* [[TMP24]], align 2, !alias.scope !46
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT10]] = add <4 x i32> [[VEC_IND9]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[VEC_IND_NEXT12]] = add <4 x i32> [[VEC_IND11]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !48
+bb:
+  %tmp = alloca i32
+  store i32 %arg4, i32* %tmp
+  %tmp6 = getelementptr inbounds i16, i16* %arg3, i64 %arg5
+  br label %bb7
+
+bb7:
+  %tmp8 = phi i64 [ 0, %bb ], [ %tmp24, %bb7 ]
+  %tmp9 = phi i32 [ %arg1, %bb ], [ %tmp23, %bb7 ]
+  %sttrunc = trunc i64 %tmp8 to i32
+  store i32 %sttrunc, i32* %tmp
+  %tmp10 = load i32, i32* %tmp
+  %tmp11 = mul nsw i32 %tmp9, %tmp10
+  %tmp12 = srem i32 %tmp11, 65536
+  %tmp13 = add nsw i32 %tmp12, %tmp9
+  %tmp14 = trunc i32 %tmp13 to i16
+  %tmp15 = trunc i64 %tmp8 to i32
+  %tmp16 = add i32 %arg, %tmp15
+  %tmp17 = zext i32 %tmp16 to i64
+  %tmp18 = getelementptr inbounds i16, i16* %tmp6, i64 %tmp17
+  store i16 %tmp14, i16* %tmp18, align 2
+  %tmp19 = add i32 %tmp13, %tmp9
+  %tmp20 = trunc i32 %tmp19 to i16
+  %tmp21 = and i16 %tmp20, 255
+  %tmp22 = getelementptr inbounds i16, i16* %arg3, i64 %tmp17
+  store i16 %tmp21, i16* %tmp22, align 2
+  %tmp23 = add nsw i32 %tmp9, 1
+  %tmp24 = add nuw nsw i64 %tmp8, 1
+  %tmp25 = icmp eq i64 %tmp24, %arg2
+  br i1 %tmp25, label %bb26, label %bb7
+
+bb26:
+  ret void
+}
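
A hand-written C approximation of the two loop shapes the new tests encode, for readers who prefer C to IR. This is a sketch only: the function and variable names are invented here, the arithmetic is simplified, and the IR in the tests above is authoritative.

  #include <stddef.h>

  /* Unsafe (loop-carried): the load of *p at the top of iteration i reads
     the value stored at the bottom of iteration i-1, so the store to the
     invariant address does not dominate the load. */
  void unsafe_dep(short *dst, size_t n, int k, int *p) {
    int acc = k;
    for (size_t i = 0; i < n; i++) {
      int t = acc * *p % 65536; /* uniform load feeds this iteration */
      dst[i] = (short)(t + acc);
      acc += 1;
      *p = t;                   /* uniform store at the bottom */
    }
  }

  /* Safe (not loop-carried): each iteration stores to *p before loading it
     back, so the store dominates the load and no value crosses iterations. */
  void safe_dep(short *dst, size_t n, int k, int *p) {
    int acc = k;
    for (size_t i = 0; i < n; i++) {
      *p = (int)i;              /* uniform store first ... */
      int t = acc * *p % 65536; /* ... then the uniform load */
      dst[i] = (short)(t + acc);
      acc += 1;
    }
  }

This is also why the patch keys the new check on dominance: if the single uniform store dominates the uniform load, the loaded value is produced within the same iteration and the pair can be vectorized; otherwise HasNonVectorizableStoresToLoopInvariantAddress is set and the vectorizer rejects the loop.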