Index: include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- include/llvm/Analysis/LoopAccessAnalysis.h
+++ include/llvm/Analysis/LoopAccessAnalysis.h
@@ -571,6 +571,12 @@
     return StoreToLoopInvariantAddress;
   }
 
+  /// Checks existence of stores to an invariant address inside the loop.
+  /// If such stores exist, checks whether those stores are non-vectorizable.
+  bool hasNonVectorizableStoreToLoopInvariantAddress() const {
+    return NonVectorizableStoreToLoopInvariantAddress;
+  }
+
   /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts
   /// them to a more usable form. All SCEV expressions during the analysis
   /// should be re-written (and therefore simplified) according to PSE.
@@ -625,6 +631,11 @@
   /// If a loop has write to a loop invariant address then it should be true.
   bool StoreToLoopInvariantAddress;
 
+  /// Indicator that there is a store to a uniform address that is
+  /// non-vectorizable.
+  /// These are a subset of the stores identified through
+  /// StoreToLoopInvariantAddress.
+  bool NonVectorizableStoreToLoopInvariantAddress;
   /// The diagnostics report generated for the analysis. E.g. why we
   /// couldn't analyze the loop.
   std::unique_ptr<OptimizationRemarkAnalysis> Report;
Index: lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- lib/Analysis/LoopAccessAnalysis.cpp
+++ lib/Analysis/LoopAccessAnalysis.cpp
@@ -1864,8 +1864,15 @@
   for (StoreInst *ST : Stores) {
     Value *Ptr = ST->getPointerOperand();
+    bool isUniformPtr = isUniform(Ptr);
     // Check for store to loop invariant address.
-    StoreToLoopInvariantAddress |= isUniform(Ptr);
+    StoreToLoopInvariantAddress |= isUniformPtr;
+
+    // Loop invariant values stored into loop invariant addresses are
+    // vectorizable.
+    NonVectorizableStoreToLoopInvariantAddress |=
+        (isUniformPtr && !TheLoop->isLoopInvariant(ST->getValueOperand()));
+
     // If we did *not* see this pointer before, insert it to the read-write
     // list. At this phase it is only a 'write' list.
     if (Seen.insert(Ptr).second) {
       ++NumReadWrites;
@@ -2265,7 +2272,7 @@
       PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
       DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
       NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
-      StoreToLoopInvariantAddress(false) {
+      StoreToLoopInvariantAddress(false), NonVectorizableStoreToLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
     analyzeLoop(AA, LI, TLI, DT);
 }
Index: lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -754,7 +754,7 @@
   if (!LAI->canVectorizeMemory())
     return false;
 
-  if (LAI->hasStoreToLoopInvariantAddress()) {
+  if (LAI->hasNonVectorizableStoreToLoopInvariantAddress()) {
     ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
               << "write to a loop invariant address could not be vectorized");
     LLVM_DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1482,8 +1482,10 @@
   /// memory access.
   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
 
-  /// The cost calculation for Load instruction \p I with uniform pointer -
-  /// scalar load + broadcast.
+  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
+  /// Load: scalar load + broadcast.
+  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
+  /// element)
   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
 
   /// Returns whether the instruction is a load or store and will be a emitted
@@ -5745,6 +5747,7 @@
 
 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                          unsigned VF) {
+  if (isa<LoadInst>(I)) {
   LoadInst *LI = cast<LoadInst>(I);
   Type *ValTy = LI->getType();
   Type *VectorTy = ToVectorTy(ValTy, VF);
@@ -5754,6 +5757,20 @@
   return TTI.getAddressComputationCost(ValTy) +
          TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
          TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+  }
+  StoreInst *SI = cast<StoreInst>(I);
+  Type *ValTy = SI->getValueOperand()->getType();
+  Type *VectorTy = ToVectorTy(ValTy, VF);
+  unsigned Alignment = SI->getAlignment();
+  unsigned AS = SI->getPointerAddressSpace();
+
+  bool isLoopInvariantValueStored =
+      TheLoop->isLoopInvariant(SI->getValueOperand());
+  return TTI.getAddressComputationCost(ValTy) +
+         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
+         (isLoopInvariantValueStored ? 0 : TTI.getVectorInstrCost(
+                                               Instruction::ExtractElement,
+                                               VectorTy, VF - 1));
 }
 
 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
@@ -5851,8 +5868,10 @@
       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
         NumPredStores++;
-      if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
-        // Scalar load + broadcast
+
+      if ((isa<LoadInst>(&I) || isa<StoreInst>(&I)) && Legal->isUniform(Ptr)) {
+        // Load: Scalar load + broadcast
+        // Store: Scalar store + isLoopInvariantValueStored ? 0 : extract
         unsigned Cost = getUniformMemOpCost(&I, VF);
         setWideningDecision(&I, VF, CM_Scalarize, Cost);
         continue;
       }
Index: test/Transforms/LoopVectorize/invariant-store-vectorization.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -0,0 +1,174 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; All of these tests check whether it is legal to vectorize stores to an
+; invariant address.
+
+
+; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction(
+; memory check is found.conflict = b[max(n-1,1)] > a && (i8* a)+1 > (i8* b)
+; CHECK: vector.memcheck:
+; CHECK: found.conflict
+
+; CHECK-LABEL: vector.body:
+; CHECK: %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK: %wide.load = load <4 x i32>
+; CHECK: [[ADD]] = add <4 x i32> %vec.phi, %wide.load
+; CHECK-NEXT: store i32 %ntrunc, i32* %a
+; CHECK-NEXT: %index.next = add i64 %index, 4
+; CHECK-NEXT: icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1
+
+; CHECK-LABEL: middle.block:
+; CHECK: %rdx.shuf = shufflevector <4 x i32>
+define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %tmp3 = add i32 %tmp0, %tmp2
+  store i32 %ntrunc, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %tmp4 = phi i32 [ %tmp3, %for.body ]
+  ret i32 %tmp4
+}
+
+; CHECK-LABEL: inv_val_store_to_inv_address(
+; CHECK-LABEL: vector.body:
+; CHECK: store i32 %ntrunc, i32* %a
+; CHECK: store <4 x i32>
+; CHECK-NEXT: %index.next = add i64 %index, 4
+; CHECK-NEXT: icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1
+define void @inv_val_store_to_inv_address(i32* %a, i64 %n, i32* %b) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  store i32 %ntrunc, i32* %a
+  store i32 %ntrunc, i32* %tmp1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+; Both of the tests below are handled as predicated stores, and the cost model
+; identifies them as predicated stores.
+
+
+; Conditional store
+; if (b[i] == k) a = ntrunc
+; TODO: Code generation for the first test can be improved: a single scalar
+; store suffices when vector.or.reduce(vector_cmp(b[i] == k)) is 1.
+
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional(
+; CHECK-LABEL: vector.body:
+; CHECK: %wide.load = load <4 x i32>, <4 x i32>*
+; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, %{{.*}}
+; CHECK: store <4 x i32>
+; CHECK-NEXT: [[EE:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 0
+; CHECK-NEXT: br i1 [[EE]], label %pred.store.if, label %pred.store.continue
+
+; CHECK-LABEL: pred.store.if:
+; CHECK-NEXT: store i32 %ntrunc, i32* %a
+; CHECK-NEXT: br label %pred.store.continue
+
+; CHECK-LABEL: pred.store.continue:
+; CHECK-NEXT: [[EE1:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 1
+define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %latch, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %latch
+
+cond_store:
+  store i32 %ntrunc, i32* %a
+  br label %latch
+
+latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %latch
+  ret void
+}
+
+; if (b[i] == k)
+;    a = ntrunc
+; else a = k;
+; For this case, we still vectorize by generating predicated stores for the if
+; and else cases.
+; TODO: Code gen can be improved by select(extract(vec_cmp(b[i], k), VF - 1) == 1, a = ntrunc, a = k)
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_diff_values(
+; CHECK-LABEL: vector.body:
+; CHECK: %wide.load = load <4 x i32>, <4 x i32>*
+; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, %{{.*}}
+; CHECK: store <4 x i32>
+; CHECK: [[CMPNOT:%[a-zA-Z0-9.]+]] = xor <4 x i1> [[CMP]],
+; CHECK: [[EENOT1:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMPNOT]], i32 0
+; CHECK: br i1 [[EENOT1]], label %pred.store.if, label %pred.store.continue
+
+; CHECK-LABEL: pred.store.if:
+; CHECK: store i32 %k, i32* %a
+; CHECK: br label %pred.store.continue
+
+; All the predicated stores for a = k are emitted first, then we check the
+; original condition and emit the predicated stores for a = ntrunc.
+
+; CHECK-LABEL: pred.store.continue14:
+; CHECK: [[EE1:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 0
+define void @inv_val_store_to_inv_address_conditional_diff_values(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %latch, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+  store i32 %ntrunc, i32* %a
+  br label %latch
+
+cond_store_k:
+  store i32 %k, i32* %a
+  br label %latch
+
+latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %latch
+  ret void
+}
Index: test/Transforms/LoopVectorize/pr31190.ll
===================================================================
--- test/Transforms/LoopVectorize/pr31190.ll
+++ test/Transforms/LoopVectorize/pr31190.ll
@@ -29,7 +29,8 @@
 @a = external global i32, align 4
 @b = external global [1 x i32], align 4
 
-; CHECK: LV: Not vectorizing: Cannot prove legality.
+; We can vectorize this loop because we are storing an invariant value into an
+; invariant address.
 ; CHECK-LABEL: @test
 define void @test() {
 entry:
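
Note on the loop pattern this patch enables (illustrative only, not part of the patch): the new tests all have the shape of a loop that stores a loop-invariant value through a loop-invariant pointer. The C++ sketch below mirrors the inv_val_store_to_inv_address test; its function and variable names are invented here and do not come from the patch.

// Illustrative mirror of inv_val_store_to_inv_address (names invented for
// this sketch). Previously the loop vectorizer bailed out on any store to a
// loop-invariant address; with this change, storing the loop-invariant value
// 'k' through the invariant pointer 'a' is accepted as vectorizable, guarded
// by the usual runtime alias checks between 'a' and 'b'.
void store_inv_value_to_inv_address(int *a, int *b, int k, long n) {
  for (long i = 0; i < n; ++i) {
    b[i] = k; // widened into a vector store
    *a = k;   // invariant value stored to an invariant address: now legal
  }
}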
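Note on the cost model change (a sketch under stated assumptions, not LLVM code): for a store to a uniform address, the patch charges the address computation plus one scalar store, and adds the cost of extracting the last vector lane only when the stored value is not loop invariant. The helper below merely restates that formula; its name and parameters are hypothetical stand-ins for the TargetTransformInfo queries made in getUniformMemOpCost.

// Hypothetical helper restating the store branch of getUniformMemOpCost.
// AddrCost, ScalarStoreCost and ExtractLastLaneCost stand in for the values
// the patch obtains from TTI; only the shape of the formula is taken from it.
unsigned uniformStoreCost(bool StoredValueIsLoopInvariant, unsigned AddrCost,
                          unsigned ScalarStoreCost,
                          unsigned ExtractLastLaneCost) {
  // A loop-invariant stored value needs no extract: the same scalar store is
  // correct for every iteration. Otherwise lane VF-1 must be extracted first.
  return AddrCost + ScalarStoreCost +
         (StoredValueIsLoopInvariant ? 0u : ExtractLastLaneCost);
}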