Index: include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- include/llvm/Analysis/LoopAccessAnalysis.h
+++ include/llvm/Analysis/LoopAccessAnalysis.h
@@ -571,6 +571,12 @@
     return StoreToLoopInvariantAddress;
   }
 
+  /// Checks whether the loop contains a store to a loop-invariant address
+  /// that cannot be vectorized.
+  bool hasNonVectorizableStoreToLoopInvariantAddress() const {
+    return NonVectorizableStoreToLoopInvariantAddress;
+  }
+
   /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts
   /// them to a more usable form. All SCEV expressions during the analysis
   /// should be re-written (and therefore simplified) according to PSE.
@@ -625,6 +631,11 @@
   /// If a loop has a write to a loop-invariant address then it should be true.
   bool StoreToLoopInvariantAddress;
 
+  /// Indicates that the loop contains a store to a uniform address that is
+  /// non-vectorizable. Such stores are a subset of those identified by
+  /// StoreToLoopInvariantAddress.
+  bool NonVectorizableStoreToLoopInvariantAddress;
+
   /// The diagnostics report generated for the analysis. E.g. why we
   /// couldn't analyze the loop.
   std::unique_ptr<OptimizationRemarkAnalysis> Report;
Index: lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- lib/Analysis/LoopAccessAnalysis.cpp
+++ lib/Analysis/LoopAccessAnalysis.cpp
@@ -1862,10 +1862,31 @@
   // writes and between reads and writes, but not between reads and reads.
   ValueSet Seen;
 
+  // The stored value is treated as loop-invariant if it is invariant itself,
+  // or if it is an instruction all of whose operands are loop-invariant.
+  auto isLoopInvariant = [this](StoreInst *ST) {
+    auto StoreVal = ST->getValueOperand();
+    if (TheLoop->isLoopInvariant(StoreVal))
+      return true;
+    if (!isa<Instruction>(StoreVal))
+      return false;
+    return TheLoop->hasLoopInvariantOperands(cast<Instruction>(StoreVal));
+  };
+
   for (StoreInst *ST : Stores) {
     Value *Ptr = ST->getPointerOperand();
+    bool isUniformPtr = isUniform(Ptr);
+
     // Check for store to loop invariant address.
-    StoreToLoopInvariantAddress |= isUniform(Ptr);
+    StoreToLoopInvariantAddress |= isUniformPtr;
+
+    // A loop-invariant value stored to a loop-invariant address is
+    // vectorizable, so only a store of a loop-varying value marks the loop
+    // as having a non-vectorizable invariant store.
+    NonVectorizableStoreToLoopInvariantAddress |=
+        (isUniformPtr && !isLoopInvariant(ST));
+
     // If we did *not* see this pointer before, insert it to the read-write
     // list. At this phase it is only a 'write' list.
     if (Seen.insert(Ptr).second) {
@@ -2265,7 +2281,8 @@
       PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
       DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
       NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
-      StoreToLoopInvariantAddress(false) {
+      StoreToLoopInvariantAddress(false),
+      NonVectorizableStoreToLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
     analyzeLoop(AA, LI, TLI, DT);
 }
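For clarity, the legality rule above can be restated as a standalone sketch.
This is illustrative only and not part of the patch; the helper name
isVectorizableInvariantStore is hypothetical, while Loop::isLoopInvariant and
Loop::hasLoopInvariantOperands are the real LoopInfo APIs used in the lambda:

    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // A store to a uniform (loop-invariant) address only blocks
    // vectorization when the stored value varies between iterations.
    static bool isVectorizableInvariantStore(const Loop *L, StoreInst *ST) {
      Value *StoreVal = ST->getValueOperand();
      if (L->isLoopInvariant(StoreVal))
        return true; // e.g. a value defined outside the loop
      // An instruction whose operands are all loop-invariant is treated the
      // same way; any other stored value makes the store non-vectorizable.
      auto *I = dyn_cast<Instruction>(StoreVal);
      return I && L->hasLoopInvariantOperands(I);
    }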
Index: lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -755,7 +755,7 @@
   if (!LAI->canVectorizeMemory())
     return false;
 
-  if (LAI->hasStoreToLoopInvariantAddress()) {
+  if (LAI->hasNonVectorizableStoreToLoopInvariantAddress()) {
     ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
               << "write to a loop invariant address could not be vectorized");
     LLVM_DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1482,8 +1482,10 @@
   /// memory access.
   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
 
-  /// The cost calculation for Load instruction \p I with uniform pointer -
-  /// scalar load + broadcast.
+  /// The cost calculation for Load/Store instruction \p I with uniform
+  /// pointer - Load: scalar load + broadcast.
+  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of
+  /// last element).
   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
 
   /// Returns whether the instruction is a load or store and will be emitted
@@ -5749,6 +5751,7 @@
 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                          unsigned VF) {
+  if (isa<LoadInst>(I)) {
   LoadInst *LI = cast<LoadInst>(I);
   Type *ValTy = LI->getType();
   Type *VectorTy = ToVectorTy(ValTy, VF);
@@ -5758,6 +5761,22 @@
   return TTI.getAddressComputationCost(ValTy) +
          TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
          TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+  }
+  StoreInst *SI = cast<StoreInst>(I);
+  // For a store, the stored value's type must be queried through the value
+  // operand; SI->getType() would be void.
+  Type *ValTy = SI->getValueOperand()->getType();
+  Type *VectorTy = ToVectorTy(ValTy, VF);
+  unsigned Alignment = SI->getAlignment();
+  unsigned AS = SI->getPointerAddressSpace();
+
+  bool isLoopInvariantValueStored =
+      TheLoop->isLoopInvariant(SI->getValueOperand());
+  return TTI.getAddressComputationCost(ValTy) +
+         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
+         (isLoopInvariantValueStored ? 0 : TTI.getVectorInstrCost(
+                                               Instruction::ExtractElement,
+                                               VectorTy, VF - 1));
 }
 
@@ -5855,8 +5872,10 @@
       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
         NumPredStores++;
 
-      if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
-        // Scalar load + broadcast
+      if ((isa<LoadInst>(&I) || isa<StoreInst>(&I)) && Legal->isUniform(Ptr)) {
+        // Load: Scalar load + broadcast
+        // Store: Scalar store + (isLoopInvariantValueStored ? 0 : extract)
         unsigned Cost = getUniformMemOpCost(&I, VF);
         setWideningDecision(&I, VF, CM_Scalarize, Cost);
         continue;
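Why the extract of the last element: all VF lanes of a uniform store write the
same address, so after each vector iteration only the final lane's value is
observable. The vectorized loop can therefore emit one scalar store of lane
VF - 1; when the stored value is loop-invariant, all lanes are identical and
no extract is needed. A minimal sketch of the resulting cost formula follows
(the function and parameter names are illustrative; the real costs come from
the TTI calls in the hunk above):

    // Cost of a store to a uniform address at a given VF (sketch).
    unsigned uniformStoreCost(unsigned AddrCost, unsigned StoreCost,
                              unsigned ExtractCost, bool StoredValInvariant) {
      // Invariant value: address computation + scalar store only.
      // Varying value: additionally extract lane VF - 1 from the vector of
      // values the scalar loop would have stored.
      return AddrCost + StoreCost + (StoredValInvariant ? 0 : ExtractCost);
    }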
Index: test/Transforms/LoopVectorize/invariant-store-vectorization.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -0,0 +1,277 @@
+; RUN: opt < %s -licm -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+; The first licm pass is there to hoist/sink invariant stores if possible.
+; Today LICM does not hoist/sink invariant stores; even if that changes, we
+; should still vectorize these loops in case licm is not run.
+
+; The licm pass run after vectorization hoists/sinks loop-invariant
+; instructions.
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; All tests below check whether it is legal to vectorize stores to an
+; invariant address.
+
+
+; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction(
+; The memory check is: found.conflict = b[max(n-1,1)] > a && (i8* a)+1 > (i8* b)
+; CHECK: vector.memcheck:
+; CHECK: found.conflict
+
+; CHECK-LABEL: vector.body:
+; CHECK: %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK: %wide.load = load <4 x i32>
+; CHECK: [[ADD]] = add <4 x i32> %vec.phi, %wide.load
+; CHECK-NEXT: store i32 %ntrunc, i32* %a
+; CHECK-NEXT: %index.next = add i64 %index, 4
+; CHECK-NEXT: icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1
+
+; CHECK-LABEL: middle.block:
+; CHECK: %rdx.shuf = shufflevector <4 x i32>
+define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %tmp3 = add i32 %tmp0, %tmp2
+  store i32 %ntrunc, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %tmp4 = phi i32 [ %tmp3, %for.body ]
+  ret i32 %tmp4
+}
+
+; CHECK-LABEL: inv_val_store_to_inv_address(
+; CHECK-LABEL: vector.body:
+; CHECK: store i32 %ntrunc, i32* %a
+; CHECK: store <4 x i32>
+; CHECK-NEXT: %index.next = add i64 %index, 4
+; CHECK-NEXT: icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1
+define void @inv_val_store_to_inv_address(i32* %a, i64 %n, i32* %b) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  store i32 %ntrunc, i32* %a
+  store i32 %ntrunc, i32* %tmp1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+; Both tests below are handled as predicated stores, and the cost model
+; identifies them as such.
+
+
+; Conditional store:
+; if (b[i] == k) a = ntrunc
+; TODO: Codegen for the first test can be improved: a single scalar store
+; suffices when vector.or.reduce(vector_cmp(b[i] == k)) is 1.
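+; A sketch of that improved codegen (illustrative only; this patch does not
+; emit it): or-reduce the comparison mask once per vector iteration and guard
+; a single scalar store on the result:
+;   %any = or-reduce(<4 x i1> %cmp)   ; i1: did any lane compare equal?
+;   br i1 %any, label %do.store, label %next
+; do.store:
+;   store i32 %ntrunc, i32* %a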
+
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional(
+; CHECK-LABEL: vector.body:
+; CHECK: %wide.load = load <4 x i32>, <4 x i32>*
+; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, %{{.*}}
+; CHECK: store <4 x i32>
+; CHECK-NEXT: [[EE:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 0
+; CHECK-NEXT: br i1 [[EE]], label %pred.store.if, label %pred.store.continue
+
+; CHECK-LABEL: pred.store.if:
+; CHECK-NEXT: store i32 %ntrunc, i32* %a
+; CHECK-NEXT: br label %pred.store.continue
+
+; CHECK-LABEL: pred.store.continue:
+; CHECK-NEXT: [[EE1:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 1
+define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %latch, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %latch
+
+cond_store:
+  store i32 %ntrunc, i32* %a
+  br label %latch
+
+latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %latch
+  ret void
+}
+
+; if (b[i] == k)
+;   a = ntrunc
+; else
+;   a = k
+; For this case, we still vectorize, by generating predicated stores for both
+; the if and the else cases.
+; TODO: Codegen can be improved to
+; select(extract(vec_cmp(b[i], k), VF - 1) == 1, a = ntrunc, a = k).
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_diff_values(
+; CHECK-LABEL: vector.body:
+; CHECK: %wide.load = load <4 x i32>, <4 x i32>*
+; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, %{{.*}}
+; CHECK: store <4 x i32>
+; CHECK: [[CMPNOT:%[a-zA-Z0-9.]+]] = xor <4 x i1> [[CMP]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK: [[EENOT1:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMPNOT]], i32 0
+; CHECK: br i1 [[EENOT1]], label %pred.store.if, label %pred.store.continue
+
+; CHECK-LABEL: pred.store.if:
+; CHECK: store i32 %k, i32* %a
+; CHECK: br label %pred.store.continue
+
+; First come all the predicated stores for a = k; then the original condition
+; is checked and predicated stores are emitted for a = ntrunc.
+
+; CHECK-LABEL: pred.store.continue14:
+; CHECK: [[EE1:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 0
+define void @inv_val_store_to_inv_address_conditional_diff_values(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %latch, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+  store i32 %ntrunc, i32* %a
+  br label %latch
+
+cond_store_k:
+  store i32 %k, i32* %a
+  br label %latch
+
+latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %latch
+  ret void
+}
+
+; Instcombine'd version of the test above. The store is no longer predicated.
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_diff_values_ic
+; CHECK-LABEL: vector.memcheck:
+; CHECK-LABEL: vector.ph:
+; CHECK: [[BSPLATIN1:%[a-zA-Z0-9.]+]] = insertelement <4 x i32> undef, i32 %k, i32 0
+; CHECK: [[BSPLATK:%[a-zA-Z0-9.]+]] = shufflevector <4 x i32> [[BSPLATIN1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: [[BSPLATIN2:%[a-zA-Z0-9.]+]] = insertelement <4 x i32> undef, i32 %ntrunc, i32 0
+; CHECK: [[BSPLAT2:%[a-zA-Z0-9.]+]] = shufflevector <4 x i32> [[BSPLATIN2]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %vector.body
+
+; CHECK-LABEL: vector.body:
+; CHECK: [[GEPB:%[a-zA-Z0-9.]+]] = getelementptr inbounds i32, i32* %b, i64 %index
+; CHECK-NEXT: [[BCB:%[a-zA-Z0-9.]+]] = bitcast i32* [[GEPB]] to <4 x i32>*
+; CHECK-NEXT: %wide.load = load <4 x i32>, <4 x i32>* [[BCB]]
+; CHECK-NEXT: [[INVCOND:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, [[BSPLATK]]
+; CHECK: %predphi = select <4 x i1> [[INVCOND]], <4 x i32> [[BSPLAT2]], <4 x i32> [[BSPLATK]]
+; CHECK-NEXT: [[EE:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> %predphi, i32 3
+; CHECK-NEXT: store i32 [[EE]], i32* %a
+; CHECK-NEXT: %index.next = add i64 %index, 4
+define void @inv_val_store_to_inv_address_conditional_diff_values_ic(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %latch, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+  br label %latch
+
+cond_store_k:
+  br label %latch
+
+latch:                                            ; preds = %cond_store_k, %cond_store
+  %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
+  store i32 %storeval, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %latch
+  ret void
+}
+
+; An invariant value, selected by an invariant condition, is stored to an
+; invariant address. This is not treated as a predicated store, since the
+; block the store belongs to is the latch block (which does not need to be
+; predicated).
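+; Conceptually (a sketch, not the verbatim output checked below): because
+; %cmp is loop-invariant, the stored value is the same on every iteration and
+; can be computed once outside the loop:
+;   %storeval = select i1 %cmp, i32 %ntrunc, i32 %k   ; hoisted to vector.ph
+; leaving an unconditional scalar store of %storeval in the vector body.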
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_inv
+; CHECK-LABEL: vector.memcheck:
+; CHECK-LABEL: vector.ph:
+; CHECK: [[BSPLATIN1:%[a-zA-Z0-9.]+]] = insertelement <4 x i32> undef, i32 %ntrunc, i32 0
+; CHECK: [[BSPLATN:%[a-zA-Z0-9.]+]] = shufflevector <4 x i32> [[BSPLATIN1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: [[INSK:%[a-zA-Z0-9.]+]] = insertelement <4 x i32> undef, i32 %k, i32 3
+; CHECK: %predphi = select <4 x i1> {{.*}}, <4 x i32> [[INSK]], <4 x i32> [[BSPLATN]]
+; CHECK: [[STOREVAL:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> %predphi, i32 3
+; CHECK-NEXT: br label %vector.body
+
+; CHECK-LABEL: vector.body:
+; CHECK: [[GEPB:%[a-zA-Z0-9.]+]] = getelementptr inbounds i32, i32* %b, i64 %index
+; CHECK-NEXT: [[BCB:%[a-zA-Z0-9.]+]] = bitcast i32* [[GEPB]] to <4 x i32>*
+; CHECK: store i32 [[STOREVAL]], i32* %a
+; CHECK-NEXT: %index.next = add i64 %index, 4
+define void @inv_val_store_to_inv_address_conditional_inv(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  %cmp = icmp eq i32 %ntrunc, %k
+  br label %for.body
+
+for.body:                                         ; preds = %latch, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+  br label %latch
+
+cond_store_k:
+  br label %latch
+
+latch:                                            ; preds = %cond_store_k, %cond_store
+  %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
+  store i32 %storeval, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %latch
+  ret void
+}
Index: test/Transforms/LoopVectorize/pr31190.ll
===================================================================
--- test/Transforms/LoopVectorize/pr31190.ll
+++ test/Transforms/LoopVectorize/pr31190.ll
@@ -29,7 +29,8 @@
 @a = external global i32, align 4
 @b = external global [1 x i32], align 4
 
-; CHECK: LV: Not vectorizing: Cannot prove legality.
+; We can vectorize this loop because we are storing an invariant value into an
+; invariant address.
 ; CHECK-LABEL: @test
 define void @test() {
 entry: