Index: include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- include/llvm/Analysis/LoopAccessAnalysis.h
+++ include/llvm/Analysis/LoopAccessAnalysis.h
@@ -564,11 +564,10 @@
   /// Print the information about the memory accesses in the loop.
   void print(raw_ostream &OS, unsigned Depth = 0) const;
 
-  /// Checks existence of store to invariant address inside loop.
-  /// If the loop has any store to invariant address, then it returns true,
-  /// else returns false.
-  bool hasStoreToLoopInvariantAddress() const {
-    return StoreToLoopInvariantAddress;
+  /// Returns true if the loop has a store of a loop-variant value to a
+  /// loop-invariant address; such stores cannot be vectorized.
+  bool hasVariantStoreToLoopInvariantAddress() const {
+    return VariantStoreToLoopInvariantAddress;
   }
 
   /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts
@@ -621,9 +620,8 @@
   /// Cache the result of analyzeLoop.
   bool CanVecMem;
 
-  /// Indicator for storing to uniform addresses.
-  /// If a loop has write to a loop invariant address then it should be true.
-  bool StoreToLoopInvariantAddress;
+  /// Indicator that there is a store of a variant value to a uniform address.
+  bool VariantStoreToLoopInvariantAddress;
 
   /// The diagnostics report generated for the analysis. E.g. why we
   /// couldn't analyze the loop.
Index: lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- lib/Analysis/LoopAccessAnalysis.cpp
+++ lib/Analysis/LoopAccessAnalysis.cpp
@@ -1862,10 +1862,21 @@
   // writes and between reads and writes, but not between reads and reads.
   ValueSet Seen;
 
+  auto isLoopInvariantStoreValue = [this](StoreInst *ST) {
+    auto StoreVal = ST->getValueOperand();
+    if (TheLoop->isLoopInvariant(StoreVal))
+      return true;
+    if (!isa<Instruction>(StoreVal))
+      return false;
+    return TheLoop->hasLoopInvariantOperands(cast<Instruction>(StoreVal));
+  };
+
   for (StoreInst *ST : Stores) {
     Value *Ptr = ST->getPointerOperand();
-    // Check for store to loop invariant address.
-    StoreToLoopInvariantAddress |= isUniform(Ptr);
+
+    VariantStoreToLoopInvariantAddress |=
+        (isUniform(Ptr) && !isLoopInvariantStoreValue(ST));
+
     // If we did *not* see this pointer before, insert it to the read-write
     // list. At this phase it is only a 'write' list.
     if (Seen.insert(Ptr).second) {
@@ -2265,7 +2276,7 @@
       PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
       DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
       NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
-      StoreToLoopInvariantAddress(false) {
+      VariantStoreToLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
     analyzeLoop(AA, LI, TLI, DT);
 }
@@ -2297,8 +2308,8 @@
   PtrRtChecking->print(OS, Depth);
   OS << "\n";
 
-  OS.indent(Depth) << "Store to invariant address was "
-                   << (StoreToLoopInvariantAddress ? "" : "not ")
+  OS.indent(Depth) << "Variant Store to invariant address was "
+                   << (VariantStoreToLoopInvariantAddress ? "" : "not ")
                    << "found in loop.\n";
 
   OS.indent(Depth) << "SCEV assumptions:\n";
Index: lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -755,7 +755,7 @@
   if (!LAI->canVectorizeMemory())
     return false;
 
-  if (LAI->hasStoreToLoopInvariantAddress()) {
+  if (LAI->hasVariantStoreToLoopInvariantAddress()) {
     ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
               << "write to a loop invariant address could not be vectorized");
     LLVM_DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1482,8 +1482,10 @@
   /// memory access.
   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
 
-  /// The cost calculation for Load instruction \p I with uniform pointer -
-  /// scalar load + broadcast.
+  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
+  /// Load: scalar load + broadcast.
+  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
+  /// element)
   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
 
   /// Returns whether the instruction is a load or store and will be a emitted
@@ -5749,15 +5751,24 @@
 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                          unsigned VF) {
-  LoadInst *LI = cast<LoadInst>(I);
-  Type *ValTy = LI->getType();
+  Type *ValTy = getMemInstValueType(I);
   Type *VectorTy = ToVectorTy(ValTy, VF);
-  unsigned Alignment = LI->getAlignment();
-  unsigned AS = LI->getPointerAddressSpace();
+  unsigned Alignment = getMemInstAlignment(I);
+  unsigned AS = getMemInstAddressSpace(I);
+  if (isa<LoadInst>(I)) {
+    return TTI.getAddressComputationCost(ValTy) +
+           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
+           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+  }
+  StoreInst *SI = cast<StoreInst>(I);
+  bool isLoopInvariantValueStored =
+      TheLoop->isLoopInvariant(SI->getValueOperand());
   return TTI.getAddressComputationCost(ValTy) +
-         TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
-         TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
+         (isLoopInvariantValueStored ? 0 : TTI.getVectorInstrCost(
+                                               Instruction::ExtractElement,
+                                               VectorTy, VF - 1));
 }
 
 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
@@ -5855,8 +5866,10 @@
       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
         NumPredStores++;
-      if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
-        // Scalar load + broadcast
+
+      if ((isa<LoadInst>(&I) || isa<StoreInst>(&I)) && Legal->isUniform(Ptr)) {
+        // Load: Scalar load + broadcast
+        // Store: Scalar store + isLoopInvariantValueStored ? 0 : extract
         unsigned Cost = getUniformMemOpCost(&I, VF);
         setWideningDecision(&I, VF, CM_Scalarize, Cost);
         continue;
Index: test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
===================================================================
--- test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
+++ test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
@@ -39,7 +39,7 @@
 ; CHECK-NEXT:      Group
 ; CHECK-NEXT:        (Low: %b High: ((4 * (1 umax %x)) + %b))
 ; CHECK-NEXT:          Member: {%b,+,4}<%for.body>
-; CHECK: Store to invariant address was not found in loop.
+; CHECK: Variant Store to invariant address was not found in loop. ; CHECK-NEXT: SCEV assumptions: ; CHECK-NEXT: {1,+,1}<%for.body> Added Flags: ; CHECK-NEXT: {0,+,1}<%for.body> Added Flags: Index: test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll =================================================================== --- test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll +++ test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll @@ -13,14 +13,14 @@ ; The LAA with the new PM is a loop pass so we go from inner to outer loops. ; OLDPM: for.cond1.preheader: -; OLDPM: Store to invariant address was not found in loop. +; OLDPM: Variant Store to invariant address was not found in loop. ; OLDPM: for.body3: -; OLDPM: Store to invariant address was found in loop. +; OLDPM: Variant Store to invariant address was found in loop. ; NEWPM: for.body3: -; NEWPM: Store to invariant address was found in loop. +; NEWPM: Variant Store to invariant address was found in loop. ; NEWPM: for.cond1.preheader: -; NEWPM: Store to invariant address was not found in loop. +; NEWPM: Variant Store to invariant address was not found in loop. define i32 @foo(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 { entry: Index: test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll =================================================================== --- test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll +++ test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll @@ -10,8 +10,8 @@ ; } ; } -; CHECK: Store to invariant address was not found in loop. -; CHECK-NOT: Store to invariant address was found in loop. +; CHECK: Variant Store to invariant address was not found in loop. +; CHECK-NOT: Variant Store to invariant address was found in loop. define i32 @foo(i32* nocapture readonly %var1, i32* nocapture %var2, i32 %itr) #0 { Index: test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll =================================================================== --- test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll +++ test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll @@ -10,7 +10,7 @@ ; } ; } -; CHECK: Store to invariant address was found in loop. +; CHECK: Variant Store to invariant address was found in loop. 
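+; The value stored to the invariant address changes across iterations, so the
+; variant store is still reported here (and it still blocks vectorization).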
define void @foo(i32* nocapture %var1, i32* nocapture %var2, i32 %itr) #0 { entry: Index: test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -0,0 +1,154 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512 -instcombine < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction +; CHECK-LABEL: vector.memcheck: +; CHECK: found.conflict + +; CHECK-LABEL: vector.body: +; CHECK: %vec.phi = phi <16 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ] +; CHECK: %wide.load = load <16 x i32> +; CHECK: [[ADD]] = add <16 x i32> %vec.phi, %wide.load +; CHECK: store i32 %ntrunc, i32* %a +; CHECK-NOT: store i32 %ntrunc, i32* %a +; CHECK: %index.next = add i64 %index, 64 + +; CHECK-LABEL: middle.block: +; CHECK: %rdx.shuf = shufflevector <16 x i32> +define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) { +entry: + %ntrunc = trunc i64 %n to i32 + br label %for.body + +for.body: ; preds = %for.body, %entry + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ] + %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp2 = load i32, i32* %tmp1, align 8 + %tmp3 = add i32 %tmp0, %tmp2 + store i32 %ntrunc, i32* %a + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: ; preds = %for.body + %tmp4 = phi i32 [ %tmp3, %for.body ] + ret i32 %tmp4 +} + +define void @inv_val_store_to_inv_address_conditional_diff_values_ic(i32* %a, i64 %n, i32* %b, i32 %k) { +; CHECK-LABEL: @inv_val_store_to_inv_address_conditional_diff_values_ic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1 +; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 64 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1 +; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]] +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775744 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <16 x i32> undef, i32 [[K:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT17]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <16 x i32> 
[[BROADCAST_SPLATINSERT19]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT22:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT21]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT23:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT24:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT23]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT25:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT26:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT25]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 32 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 48 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <16 x i32>* +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i32>, <16 x i32>* [[TMP6]], align 8, !alias.scope !8, !noalias !11 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD10]], [[BROADCAST_SPLAT18]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT20]], <16 x i32>* [[TMP8]], align 4, !alias.scope !8, !noalias !11 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP3]] to <16 x i32>* +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT22]], <16 x i32>* [[TMP9]], align 4, !alias.scope !8, !noalias !11 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>* +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT24]], <16 x i32>* [[TMP10]], align 4, !alias.scope !8, !noalias !11 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <16 x i32>* +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT26]], <16 x i32>* [[TMP11]], align 4, !alias.scope !8, !noalias !11 +; CHECK-NEXT: [[PREDPHI29:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[BROADCAST_SPLAT26]], <16 x i32> [[BROADCAST_SPLAT18]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i32> [[PREDPHI29]], i32 15 +; CHECK-NEXT: store i32 [[TMP12]], i32* [[A]], align 4, !alias.scope !11 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !13 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]] +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[TMP1]], align 4 +; 
CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]] +; CHECK: cond_store: +; CHECK-NEXT: br label [[LATCH]] +; CHECK: cond_store_k: +; CHECK-NEXT: br label [[LATCH]] +; CHECK: latch: +; CHECK-NEXT: [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ] +; CHECK-NEXT: store i32 [[STOREVAL]], i32* [[A]], align 4 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !14 +; CHECK: for.end: +; CHECK-NEXT: ret void +; + + + + +entry: + %ntrunc = trunc i64 %n to i32 + br label %for.body + +for.body: ; preds = %for.body, %entry + %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] + %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp2 = load i32, i32* %tmp1, align 8 + %cmp = icmp eq i32 %tmp2, %k + store i32 %ntrunc, i32* %tmp1 + br i1 %cmp, label %cond_store, label %cond_store_k + +cond_store: + br label %latch + +cond_store_k: + br label %latch + +latch: + %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ] + store i32 %storeval, i32* %a + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} Index: test/Transforms/LoopVectorize/invariant-store-vectorization.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/invariant-store-vectorization.ll @@ -0,0 +1,277 @@ +; RUN: opt < %s -licm -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s + +; First licm pass is to hoist/sink invariant stores if possible. Today LICM does +; not hoist/sink the invariant stores. Even if that changes, we should still +; vectorize this loop in case licm is not run. + +; The next licm pass after vectorization is to hoist/sink loop invariant +; instructions. +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; all tests that check whether it is legal to vectorize the stores to invariant +; address. 
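+;
+; The common shape of these loops, in C-like pseudocode, is roughly:
+;   for (i = 0; i < n; i++) {
+;     ... = b[i];      // varying access that gets widened
+;     *a = val;        // store to an address that does not depend on i
+;   }
+; where val is either loop invariant or varies inside the loop.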
+ + +; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction( +; memory check is found.conflict = b[max(n-1,1)] > a && (i8* a)+1 > (i8* b) +; CHECK: vector.memcheck: +; CHECK: found.conflict + +; CHECK-LABEL: vector.body: +; CHECK: %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ] +; CHECK: %wide.load = load <4 x i32> +; CHECK: [[ADD]] = add <4 x i32> %vec.phi, %wide.load +; CHECK-NEXT: store i32 %ntrunc, i32* %a +; CHECK-NEXT: %index.next = add i64 %index, 4 +; CHECK-NEXT: icmp eq i64 %index.next, %n.vec +; CHECK-NEXT: br i1 + +; CHECK-LABEL: middle.block: +; CHECK: %rdx.shuf = shufflevector <4 x i32> +define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) { +entry: + %ntrunc = trunc i64 %n to i32 + br label %for.body + +for.body: ; preds = %for.body, %entry + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ] + %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp2 = load i32, i32* %tmp1, align 8 + %tmp3 = add i32 %tmp0, %tmp2 + store i32 %ntrunc, i32* %a + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: ; preds = %for.body + %tmp4 = phi i32 [ %tmp3, %for.body ] + ret i32 %tmp4 +} + +; CHECK-LABEL: inv_val_store_to_inv_address( +; CHECK-LABEL: vector.body: +; CHECK: store i32 %ntrunc, i32* %a +; CHECK: store <4 x i32> +; CHECK-NEXT: %index.next = add i64 %index, 4 +; CHECK-NEXT: icmp eq i64 %index.next, %n.vec +; CHECK-NEXT: br i1 +define void @inv_val_store_to_inv_address(i32* %a, i64 %n, i32* %b) { +entry: + %ntrunc = trunc i64 %n to i32 + br label %for.body + +for.body: ; preds = %for.body, %entry + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp2 = load i32, i32* %tmp1, align 8 + store i32 %ntrunc, i32* %a + store i32 %ntrunc, i32* %tmp1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +; Both of these tests below are handled as predicated stores and have the cost model +; as identifying these as predicated stores. + + +; Conditional store +; if (b[i] == k) a = ntrunc +; TODO: We can be better with the code gen for the first test and we can have +; just one scalar store if vector.or.reduce(vector_cmp(b[i] == k)) is 1. 
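+;
+; In the checks below, the invariant store is emitted in predicated form: the
+; <4 x i1> compare result is extracted one lane at a time, and each set lane
+; branches to a pred.store.if block containing the scalar store to %a.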
+ +; CHECK-LABEL:inv_val_store_to_inv_address_conditional( +; CHECK-LABEL: vector.body: +; CHECK: %wide.load = load <4 x i32>, <4 x i32>* +; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, %{{.*}} +; CHECK: store <4 x i32> +; CHECK-NEXT: [[EE:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 0 +; CHECK-NEXT: br i1 [[EE]], label %pred.store.if, label %pred.store.continue + +; CHECK-LABEL: pred.store.if: +; CHECK-NEXT: store i32 %ntrunc, i32* %a +; CHECK-NEXT: br label %pred.store.continue + +; CHECK-LABEL: pred.store.continue: +; CHECK-NEXT: [[EE1:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 1 +define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) { +entry: + %ntrunc = trunc i64 %n to i32 + br label %for.body + +for.body: ; preds = %for.body, %entry + %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] + %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp2 = load i32, i32* %tmp1, align 8 + %cmp = icmp eq i32 %tmp2, %k + store i32 %ntrunc, i32* %tmp1 + br i1 %cmp, label %cond_store, label %latch + +cond_store: + store i32 %ntrunc, i32* %a + br label %latch + +latch: + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; if (b[i] == k) +; a = ntrunc +; else a = k; +; For this case, we still vectorize, by generating predicated stores for the if +; and else cases. +; TODO: Code gen can be improved by select(extract(vec_cmp(b[i], k), VF - 1) == 1, a = ntrunc, a = k) +; CHECK-LABEL:inv_val_store_to_inv_address_conditional_diff_values( +; CHECK-LABEL: vector.body: +; CHECK: %wide.load = load <4 x i32>, <4 x i32>* +; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, %{{.*}} +; CHECK: store <4 x i32> +; CHECK: [[CMPNOT:%[a-zA-Z0-9.]+]] = xor <4 x i1> [[CMP]], +; CHECK: [[EENOT1:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMPNOT]], i32 0 +; CHECK: br i1 [[EENOT1]], label %pred.store.if, label %pred.store.continue + +; CHECK-LABEL: pred.store.if: +; CHECK: store i32 %k, i32* %a +; CHECK: br label %pred.store.continue + +; all predicated stores for a = k +; then we check the original condition and do a predicated stores for a = ntrunc. + +; CHECK-LABEL: pred.store.continue14: +; CHECK: [[EE1:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 0 +define void @inv_val_store_to_inv_address_conditional_diff_values(i32* %a, i64 %n, i32* %b, i32 %k) { +entry: + %ntrunc = trunc i64 %n to i32 + br label %for.body + +for.body: ; preds = %for.body, %entry + %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] + %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp2 = load i32, i32* %tmp1, align 8 + %cmp = icmp eq i32 %tmp2, %k + store i32 %ntrunc, i32* %tmp1 + br i1 %cmp, label %cond_store, label %cond_store_k + +cond_store: + store i32 %ntrunc, i32* %a + br label %latch + +cond_store_k: + store i32 %k, i32 * %a + br label %latch + +latch: + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Instcombine'd version of above test. Now the store is no longer predicated. 
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_diff_values_ic +; CHECK-LABEL: vector.memcheck: +; CHECK-LABEL: vector.ph: +; CHECK: [[BSPLATIN1:%[a-zA-Z0-9.]+]] = insertelement <4 x i32> undef, i32 %k, i32 0 +; CHECK: [[BSPLATK:%[a-zA-Z0-9.]+]] = shufflevector <4 x i32> [[BSPLATIN1]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: [[BSPLATIN2:%[a-zA-Z0-9.]+]] = insertelement <4 x i32> undef, i32 %ntrunc, i32 0 +; CHECK: [[BSPLAT2:%[a-zA-Z0-9.]+]] = shufflevector <4 x i32> [[BSPLATIN2]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %vector.body + +; CHECK-LABEL: vector.body: +; CHECK: [[GEPB:%[a-zA-Z0-9.]+]] = getelementptr inbounds i32, i32* %b, i64 %index +; CHECK-NEXT: [[BCB:%[a-zA-Z0-9.]+]] = bitcast i32* [[GEPB]] to <4 x i32>* +; CHECK-NEXT: %wide.load = load <4 x i32>, <4 x i32>* [[BCB]] +; CHECK-NEXT: [[INVCOND:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, [[BSPLATK]] +; CHECK: %predphi = select <4 x i1> [[INVCOND]], <4 x i32> [[BSPLAT2]], <4 x i32> [[BSPLATK]] +; CHECK-NEXT: [[EE:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> %predphi, i32 3 +; CHECK-NEXT: store i32 [[EE]], i32* %a +; CHECK-NEXT: %index.next = add i64 %index, 4 +define void @inv_val_store_to_inv_address_conditional_diff_values_ic(i32* %a, i64 %n, i32* %b, i32 %k) { +entry: + %ntrunc = trunc i64 %n to i32 + br label %for.body + +for.body: ; preds = %for.body, %entry + %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] + %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp2 = load i32, i32* %tmp1, align 8 + %cmp = icmp eq i32 %tmp2, %k + store i32 %ntrunc, i32* %tmp1 + br i1 %cmp, label %cond_store, label %cond_store_k + +cond_store: + br label %latch + +cond_store_k: + br label %latch + +latch: + %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ] + store i32 %storeval, i32* %a + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; invariant val stored to invariant address predicated on invariant condition +; This is not treated as a predicated store since the block the store belongs to +; is the latch block (which doesn't need to be predicated). 
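+; Pseudocode for the loop below (roughly):
+;   cond = (ntrunc == k);            // loop-invariant condition
+;   for (i = 0; i < n; i++) {
+;     b[i] = ntrunc;
+;     *a = cond ? ntrunc : k;        // selected value is loop invariant
+;   }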
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_inv +; CHECK-LABEL: vector.memcheck: +; CHECK-LABEL: vector.ph: +; CHECK: [[BSPLATIN1:%[a-zA-Z0-9.]+]] = insertelement <4 x i32> undef, i32 %ntrunc, i32 0 +; CHECK: [[BSPLATN:%[a-zA-Z0-9.]+]] = shufflevector <4 x i32> [[BSPLATIN1]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: [[INSK:%[a-zA-Z0-9.]+]] = insertelement <4 x i32> undef, i32 %k, i32 3 +; CHECK: %predphi = select <4 x i1> {{.*}}, <4 x i32> [[INSK]], <4 x i32> [[BSPLATN]] +; CHECK: [[STOREVAL:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> %predphi, i32 3 +; CHECK-NEXT: br label %vector.body + +; CHECK-LABEL: vector.body: +; CHECK: [[GEPB:%[a-zA-Z0-9.]+]] = getelementptr inbounds i32, i32* %b, i64 %index +; CHECK-NEXT: [[BCB:%[a-zA-Z0-9.]+]] = bitcast i32* [[GEPB]] to <4 x i32>* +; CHECK: store i32 [[STOREVAL]], i32* %a +; CHECK-NEXT: %index.next = add i64 %index, 4 +define void @inv_val_store_to_inv_address_conditional_inv(i32* %a, i64 %n, i32* %b, i32 %k) { +entry: + %ntrunc = trunc i64 %n to i32 + %cmp = icmp eq i32 %ntrunc, %k + br label %for.body + +for.body: ; preds = %for.body, %entry + %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] + %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp2 = load i32, i32* %tmp1, align 8 + store i32 %ntrunc, i32* %tmp1 + br i1 %cmp, label %cond_store, label %cond_store_k + +cond_store: + br label %latch + +cond_store_k: + br label %latch + +latch: + %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ] + store i32 %storeval, i32* %a + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} Index: test/Transforms/LoopVectorize/pr31190.ll =================================================================== --- test/Transforms/LoopVectorize/pr31190.ll +++ test/Transforms/LoopVectorize/pr31190.ll @@ -29,7 +29,8 @@ @a = external global i32, align 4 @b = external global [1 x i32], align 4 -; CHECK: LV: Not vectorizing: Cannot prove legality. +; We can vectorize this loop because we are storing an invariant value into an +; invariant address. ; CHECK-LABEL: @test define void @test() { entry: