Index: llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
+++ llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -564,11 +564,10 @@
   /// Print the information about the memory accesses in the loop.
   void print(raw_ostream &OS, unsigned Depth = 0) const;
 
-  /// Checks existence of store to invariant address inside loop.
-  /// If the loop has any store to invariant address, then it returns true,
-  /// else returns false.
-  bool hasStoreToLoopInvariantAddress() const {
-    return StoreToLoopInvariantAddress;
+  /// If the loop has any store of a variant value to an invariant address, then
+  /// return true, else return false.
+  bool hasVariantStoreToLoopInvariantAddress() const {
+    return HasVariantStoreToLoopInvariantAddress;
   }
 
   /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts
@@ -621,9 +620,8 @@
   /// Cache the result of analyzeLoop.
   bool CanVecMem;
 
-  /// Indicator for storing to uniform addresses.
-  /// If a loop has write to a loop invariant address then it should be true.
-  bool StoreToLoopInvariantAddress;
+  /// Indicator that there is a store of a variant value to a uniform address.
+  bool HasVariantStoreToLoopInvariantAddress;
 
   /// The diagnostics report generated for the analysis. E.g. why we
   /// couldn't analyze the loop.
Index: llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
+++ llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1862,10 +1862,21 @@
   // writes and between reads and writes, but not between reads and reads.
   ValueSet Seen;
 
+  // Record uniform store addresses to identify if we have multiple stores
+  // to the same address.
+  ValueSet UniformStores;
+
   for (StoreInst *ST : Stores) {
     Value *Ptr = ST->getPointerOperand();
-    // Check for store to loop invariant address.
-    StoreToLoopInvariantAddress |= isUniform(Ptr);
+
+    if (isUniform(Ptr)) {
+      // Consider multiple stores to the same uniform address as a store of a
+      // variant value.
+      bool MultipleStoresToUniformPtr = !UniformStores.insert(Ptr).second;
+      HasVariantStoreToLoopInvariantAddress |=
+          (!isUniform(ST->getValueOperand()) || MultipleStoresToUniformPtr);
+    }
+
     // If we did *not* see this pointer before, insert it to the read-write
     // list. At this phase it is only a 'write' list.
     if (Seen.insert(Ptr).second) {
@@ -2265,7 +2276,7 @@
       PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
       DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
       NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
-      StoreToLoopInvariantAddress(false) {
+      HasVariantStoreToLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
     analyzeLoop(AA, LI, TLI, DT);
 }
@@ -2297,8 +2308,8 @@
   PtrRtChecking->print(OS, Depth);
   OS << "\n";
 
-  OS.indent(Depth) << "Store to invariant address was "
-                   << (StoreToLoopInvariantAddress ? "" : "not ")
+  OS.indent(Depth) << "Variant Store to invariant address was "
+                   << (HasVariantStoreToLoopInvariantAddress ? "" : "not ")
                    << "found in loop.\n";
 
   OS.indent(Depth) << "SCEV assumptions:\n";
Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -817,9 +817,10 @@
   if (!LAI->canVectorizeMemory())
     return false;
 
-  if (LAI->hasStoreToLoopInvariantAddress()) {
+  if (LAI->hasVariantStoreToLoopInvariantAddress()) {
     ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
-              << "write to a loop invariant address could not be vectorized");
+              << "write of variant value to a loop invariant address could not "
+                 "be vectorized");
     LLVM_DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
     return false;
   }
Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1174,8 +1174,11 @@
   /// memory access.
   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
 
-  /// The cost calculation for Load instruction \p I with uniform pointer -
-  /// scalar load + broadcast.
+  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
+  /// Load: scalar load + broadcast.
+  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
+  /// element)
+  /// TODO: Test the extra cost of the extract when a loop variant value is stored.
   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
 
   /// Returns whether the instruction is a load or store and will be a emitted
@@ -5297,15 +5300,23 @@
 
 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                          unsigned VF) {
-  LoadInst *LI = cast<LoadInst>(I);
-  Type *ValTy = LI->getType();
+  Type *ValTy = getMemInstValueType(I);
   Type *VectorTy = ToVectorTy(ValTy, VF);
-  unsigned Alignment = LI->getAlignment();
-  unsigned AS = LI->getPointerAddressSpace();
+  unsigned Alignment = getLoadStoreAlignment(I);
+  unsigned AS = getLoadStoreAddressSpace(I);
+  if (isa<LoadInst>(I)) {
+    return TTI.getAddressComputationCost(ValTy) +
+           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
+           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+  }
+  StoreInst *SI = cast<StoreInst>(I);
+  bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
   return TTI.getAddressComputationCost(ValTy) +
-         TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
-         TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
+         (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
+                                              Instruction::ExtractElement,
+                                              VectorTy, VF - 1));
 }
 
 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
@@ -5404,15 +5415,22 @@
       if (!Ptr)
         continue;
 
+      // TODO: We should generate better code and update the cost model for
+      // predicated uniform stores. Today they are treated as any other
+      // predicated store (see added test cases in
+      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;
 
-      if (isa<LoadInst>(&I) && Legal->isUniform(Ptr) &&
-          // Conditional loads should be scalarized and predicated.
+      if (Legal->isUniform(Ptr) &&
+          // Conditional loads and stores should be scalarized and predicated.
          // isScalarWithPredication cannot be used here since masked
          // gather/scatters are not considered scalar with predication.
          !Legal->blockNeedsPredication(I.getParent())) {
-        // Scalar load + broadcast
+        // TODO: Avoid replicating loads and stores instead of
+        // relying on instcombine to remove them.
+        // Load: Scalar load + broadcast
+        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        unsigned Cost = getUniformMemOpCost(&I, VF);
        setWideningDecision(&I, VF, CM_Scalarize, Cost);
        continue;
Index: llvm/trunk/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
===================================================================
--- llvm/trunk/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
+++ llvm/trunk/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
@@ -39,7 +39,7 @@
 ; CHECK-NEXT: Group
 ; CHECK-NEXT: (Low: %b High: ((4 * (1 umax %x)) + %b))
 ; CHECK-NEXT: Member: {%b,+,4}<%for.body>
-; CHECK: Store to invariant address was not found in loop.
+; CHECK: Variant Store to invariant address was not found in loop.
 ; CHECK-NEXT: SCEV assumptions:
 ; CHECK-NEXT: {1,+,1}<%for.body> Added Flags:
 ; CHECK-NEXT: {0,+,1}<%for.body> Added Flags:
Index: llvm/trunk/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
===================================================================
--- llvm/trunk/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
+++ llvm/trunk/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
@@ -13,14 +13,14 @@
 ; The LAA with the new PM is a loop pass so we go from inner to outer loops.
 
 ; OLDPM: for.cond1.preheader:
-; OLDPM: Store to invariant address was not found in loop.
+; OLDPM: Variant Store to invariant address was not found in loop.
 ; OLDPM: for.body3:
-; OLDPM: Store to invariant address was found in loop.
+; OLDPM: Variant Store to invariant address was found in loop.
 
 ; NEWPM: for.body3:
-; NEWPM: Store to invariant address was found in loop.
+; NEWPM: Variant Store to invariant address was found in loop.
 ; NEWPM: for.cond1.preheader:
-; NEWPM: Store to invariant address was not found in loop.
+; NEWPM: Variant Store to invariant address was not found in loop.
 
 define i32 @foo(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
 entry:
Index: llvm/trunk/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
===================================================================
--- llvm/trunk/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
+++ llvm/trunk/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
@@ -10,8 +10,8 @@
 ;    }
 ;  }
 
-; CHECK: Store to invariant address was not found in loop.
-; CHECK-NOT: Store to invariant address was found in loop.
+; CHECK: Variant Store to invariant address was not found in loop.
+; CHECK-NOT: Variant Store to invariant address was found in loop.
 
 define i32 @foo(i32* nocapture readonly %var1, i32* nocapture %var2, i32 %itr) #0 {
Index: llvm/trunk/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
===================================================================
--- llvm/trunk/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
+++ llvm/trunk/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
@@ -10,7 +10,7 @@
 ;    }
 ;  }
 
-; CHECK: Store to invariant address was found in loop.
+; CHECK: Variant Store to invariant address was found in loop.
 
 define void @foo(i32* nocapture %var1, i32* nocapture %var2, i32 %itr) #0 {
 entry:
Index: llvm/trunk/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512 -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; First test checks that a loop with a reduction and a uniform store gets
+; vectorized.
+; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction
+; CHECK-LABEL: vector.memcheck:
+; CHECK: found.conflict
+
+; CHECK-LABEL: vector.body:
+; CHECK: %vec.phi = phi <16 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK: %wide.load = load <16 x i32>
+; CHECK: [[ADD]] = add <16 x i32> %vec.phi, %wide.load
+; CHECK: store i32 %ntrunc, i32* %a
+; CHECK-NOT: store i32 %ntrunc, i32* %a
+; CHECK: %index.next = add i64 %index, 64
+
+; CHECK-LABEL: middle.block:
+; CHECK: %rdx.shuf = shufflevector <16 x i32>
+define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %tmp3 = add i32 %tmp0, %tmp2
+  store i32 %ntrunc, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %tmp4 = phi i32 [ %tmp3, %for.body ]
+  ret i32 %tmp4
+}
+
+; Conditional store
+; if (b[i] == k) a = ntrunc
+define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+; CHECK-LABEL: @inv_val_store_to_inv_address_conditional(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT5]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT7]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT9]], <16 x i32*> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !8, !noalias !11
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT6]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT8]], <16 x i32>* [[TMP5]], align 4, !alias.scope !8, !noalias !11
+; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[BROADCAST_SPLAT8]], <16 x i32*> [[BROADCAST_SPLAT10]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !11
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !13
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]]
+; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]]
+; CHECK: cond_store:
+; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4
+; CHECK-NEXT: br label [[LATCH]]
+; CHECK: latch:
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !14
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %latch
+
+cond_store:
+  store i32 %ntrunc, i32* %a
+  br label %latch
+
+latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
Index: llvm/trunk/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -0,0 +1,260 @@
+; RUN: opt < %s -licm -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+; The first licm pass is to hoist/sink invariant stores if possible. Today LICM does
+; not hoist/sink the invariant stores. Even if that changes, we should still
+; vectorize this loop in case licm is not run.
+
+; The next licm pass after vectorization is to hoist/sink loop invariant
+; instructions.
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; All tests check that it is legal to vectorize the stores to an invariant
+; address.
+
+
+; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction(
+; memory check is found.conflict = b[max(n-1,1)] > a && (i8* a)+1 > (i8* b)
+; CHECK: vector.memcheck:
+; CHECK: found.conflict
+
+; CHECK-LABEL: vector.body:
+; CHECK: %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK: %wide.load = load <4 x i32>
+; CHECK: [[ADD]] = add <4 x i32> %vec.phi, %wide.load
+; CHECK-NEXT: store i32 %ntrunc, i32* %a
+; CHECK-NEXT: %index.next = add i64 %index, 4
+; CHECK-NEXT: icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1
+
+; CHECK-LABEL: middle.block:
+; CHECK: %rdx.shuf = shufflevector <4 x i32>
+define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %tmp3 = add i32 %tmp0, %tmp2
+  store i32 %ntrunc, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %tmp4 = phi i32 [ %tmp3, %for.body ]
+  ret i32 %tmp4
+}
+
+; CHECK-LABEL: inv_val_store_to_inv_address(
+; CHECK-LABEL: vector.body:
+; CHECK: store i32 %ntrunc, i32* %a
+; CHECK: store <4 x i32>
+; CHECK-NEXT: %index.next = add i64 %index, 4
+; CHECK-NEXT: icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1
+define void @inv_val_store_to_inv_address(i32* %a, i64 %n, i32* %b) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  store i32 %ntrunc, i32* %a
+  store i32 %ntrunc, i32* %tmp1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+; Both of these tests below are handled as predicated stores.
+
+; Conditional store
+; if (b[i] == k) a = ntrunc
+; TODO: We can do better with the code gen for the first test and we can have
+; just one scalar store if vector.or.reduce(vector_cmp(b[i] == k)) is 1.
+
+; CHECK-LABEL:inv_val_store_to_inv_address_conditional(
+; CHECK-LABEL: vector.body:
+; CHECK: %wide.load = load <4 x i32>, <4 x i32>*
+; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, %{{.*}}
+; CHECK: store <4 x i32>
+; CHECK-NEXT: [[EE:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 0
+; CHECK-NEXT: br i1 [[EE]], label %pred.store.if, label %pred.store.continue
+
+; CHECK-LABEL: pred.store.if:
+; CHECK-NEXT: store i32 %ntrunc, i32* %a
+; CHECK-NEXT: br label %pred.store.continue
+
+; CHECK-LABEL: pred.store.continue:
+; CHECK-NEXT: [[EE1:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 1
+define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %latch
+
+cond_store:
+  store i32 %ntrunc, i32* %a
+  br label %latch
+
+latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; if (b[i] == k)
+; a = ntrunc
+; else a = k;
+; TODO: We could vectorize this once we support multiple uniform stores to the
+; same address.
+; CHECK-LABEL:inv_val_store_to_inv_address_conditional_diff_values(
+; CHECK-NOT: load <4 x i32>
+define void @inv_val_store_to_inv_address_conditional_diff_values(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+  store i32 %ntrunc, i32* %a
+  br label %latch
+
+cond_store_k:
+  store i32 %k, i32 * %a
+  br label %latch
+
+latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Instcombine'd version of above test. Now the store is no longer of invariant
+; value.
+; TODO: We should be able to vectorize this loop once we support vectorizing
+; stores of variant values to invariant addresses.
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_diff_values_ic
+; CHECK-NOT: <4 x
+define void @inv_val_store_to_inv_address_conditional_diff_values_ic(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+  br label %latch
+
+cond_store_k:
+  br label %latch
+
+latch:
+  %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
+  store i32 %storeval, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Invariant value stored to an invariant address, predicated on an invariant condition.
+; This is not treated as a predicated store since the block the store belongs to
+; is the latch block (which doesn't need to be predicated).
+; TODO: We should vectorize this loop once we relax the check for
+; variant/invariant values being stored to an invariant address.
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_inv
+; CHECK-NOT: <4 x
+define void @inv_val_store_to_inv_address_conditional_inv(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  %cmp = icmp eq i32 %ntrunc, %k
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+  br label %latch
+
+cond_store_k:
+  br label %latch
+
+latch:
+  %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
+  store i32 %storeval, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; TODO: This loop can be vectorized once we support a variant value being
+; stored into an invariant address.
+; CHECK-LABEL: variant_val_store_to_inv_address
+; CHECK-NOT: <4 x i32>
+define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  %cmp = icmp eq i32 %ntrunc, %k
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  store i32 %tmp2, i32* %a
+  %tmp3 = add i32 %tmp0, %tmp2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %rdx.lcssa = phi i32 [ %tmp0, %for.body ]
+  ret i32 %rdx.lcssa
+}
Index: llvm/trunk/test/Transforms/LoopVectorize/pr31190.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/pr31190.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/pr31190.ll
@@ -29,7 +29,10 @@
 @a = external global i32, align 4
 @b = external global [1 x i32], align 4
 
-; CHECK: LV: Not vectorizing: Cannot prove legality.
+; We can vectorize this loop because we are storing an invariant value into an
+; invariant address.
+
+; CHECK: LV: We can vectorize this loop!
 ; CHECK-LABEL: @test
 define void @test() {
 entry: