diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -379,6 +379,24 @@
   return None;
 }
 
+/// Collect into \p Result \p Phi and its transitive users inside loop \p L.
+static void
+collectReductionInstructions(PHINode *Phi, Loop *L,
+                             SmallPtrSetImpl<Instruction *> &Result) {
+  SmallVector<Instruction *, 8> Worklist;
+  Worklist.push_back(Phi);
+  Result.insert(Phi);
+
+  while (!Worklist.empty()) {
+    Instruction *Cur = Worklist.back();
+    Worklist.pop_back();
+    for (User *U : Cur->users()) {
+      Instruction *UI = cast<Instruction>(U);
+      if (L->contains(UI->getParent()) && Result.insert(UI).second)
+        Worklist.push_back(UI);
+    }
+  }
+}
+
 namespace llvm {
 
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
@@ -3711,14 +3729,27 @@
 
   // Fix the vector-loop phi.
 
+  // Wrap flags are in general invalid after vectorization, clear them.
+  SmallPtrSet<Instruction *, 4> ReductionInstructions;
+  collectReductionInstructions(Phi, OrigLoop, ReductionInstructions);
+  for (Instruction *I : ReductionInstructions) {
+    if (!isa<OverflowingBinaryOperator>(I))
+      continue;
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      Value *V = getOrCreateVectorValue(I, Part);
+      cast<Instruction>(V)->dropPoisonGeneratingFlags();
+    }
+  }
+
   // Reductions do not have to start at zero. They can start with
   // any loop invariant values.
   BasicBlock *Latch = OrigLoop->getLoopLatch();
   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
+
   for (unsigned Part = 0; Part < UF; ++Part) {
     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
     Value *Val = getOrCreateVectorValue(LoopVal, Part);
-    // Make sure to add the reduction stat value only to the
+    // Make sure to add the reduction start value only to the
     // first unroll part.
     Value *StartVal = (Part == 0) ? VectorStart : Identity;
     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
@@ -96,7 +96,7 @@
 ; CHECK: [[LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* {{.*}}, i32 4, <8 x i1> [[ICMPULE]], <8 x i32> undef)
 ; CHECK: [[LOAD2:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* {{.*}}, i32 4, <8 x i1> [[ICMPULE]], <8 x i32> undef)
 ; CHECK-NEXT: [[ADD:%.*]] = add nsw <8 x i32> [[LOAD2]], [[LOAD1]]
-; CHECK-NEXT: [[ACCUM]] = add nuw nsw <8 x i32> [[ADD]], [[ACCUM_PHI]]
+; CHECK-NEXT: [[ACCUM]] = add <8 x i32> [[ADD]], [[ACCUM_PHI]]
 ; CHECK: [[LIVEOUT:%.*]] = select <8 x i1> [[ICMPULE]], <8 x i32> [[ACCUM]], <8 x i32> [[ACCUM_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
 ; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -248,8 +248,8 @@
 ; UNROLL-NOSIMPLIFY-NEXT: store i32 2, i32* [[TMP1]], align 4
 ; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE4]]
 ; UNROLL-NOSIMPLIFY: pred.store.continue4:
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = add nsw i32 [[VEC_PHI]], 1
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = add nsw i32 [[VEC_PHI2]], 1
+; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = add i32 [[VEC_PHI]], 1
+; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = add i32 [[VEC_PHI2]], 1
 ; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI]] = select i1 undef, i32 [[VEC_PHI]], i32 [[TMP4]]
 ; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI5]] = select i1 undef, i32 [[VEC_PHI2]], i32 [[TMP5]]
 ; UNROLL-NOSIMPLIFY-NEXT: [[OFFSET_IDX6:%.*]] = add i64 undef, [[INDEX]]
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -139,9 +139,9 @@
 ; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32>
 ; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32>
 ; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32>
-; CHECK: add nsw <4 x i32>
+; CHECK: add <4 x i32>
 ; CHECK: sub <4 x i32>
-; CHECK: add nsw <4 x i32>
+; CHECK: add <4 x i32>
 ; CHECK: sub <4 x i32>
 
 %struct.ST4 = type { i32, i32, i32, i32 }
@@ -529,7 +529,7 @@
 ; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32>
 ; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32>
 ; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float>
-; CHECK: add nsw <4 x i32>
+; CHECK: add <4 x i32>
 ; CHECK: fadd fast <4 x float>
 
 %struct.IntFloat = type { i32, float }
@@ -645,7 +645,7 @@
 ; CHECK: store i32 %[[X4:.+]], {{.*}}
 ; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
 ; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32>
-; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]
+; CHECK: add <4 x i32> %[[S1]], %[[Phi]]
 
 define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
 entry:
@@ -746,7 +746,7 @@
 ; CHECK: store i32 %[[X4:.+]], {{.*}}
 ; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
 ; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32>
-; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]
+; CHECK: add <4 x i32> %[[S1]], %[[Phi]]
 
 define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/no_int_induction.ll b/llvm/test/Transforms/LoopVectorize/no_int_induction.ll
--- a/llvm/test/Transforms/LoopVectorize/no_int_induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/no_int_induction.ll
@@ -10,7 +10,7 @@
 ;CHECK: phi i64
 ;CHECK: phi <4 x i32>
 ;CHECK: load <4 x i32>
-;CHECK: add nsw <4 x i32>
+;CHECK: add <4 x i32>
 ;CHECK: ret i32
 define i32 @sum_array(i32* %A, i32 %n) nounwind uwtable readonly noinline ssp {
   %1 = sext i32 %n to i64
@@ -37,7 +37,7 @@
 ;CHECK: phi i16
 ;CHECK: phi <4 x i32>
 ;CHECK: load <4 x i32>
-;CHECK: add nsw <4 x i32>
+;CHECK: add <4 x i32>
 ;CHECK: ret i32
 define i32 @sum_array_as1(i32 addrspace(1)* %A, i32 %n) nounwind uwtable readonly noinline ssp {
   %1 = sext i32 %n to i64
diff --git a/llvm/test/Transforms/LoopVectorize/nuw.ll b/llvm/test/Transforms/LoopVectorize/nuw.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/nuw.ll
@@ -0,0 +1,59 @@
+; RUN: opt %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -S | FileCheck %s
+
+; Fixes PR43828
+
+; CHECK-LABEL: @test
+define void @test(i32* %B) {
+entry:
+  br label %outer_loop
+
+outer_loop:
+  %local_4 = phi i32 [ 2, %entry ], [ %4, %outer_tail]
+  br label %inner_loop
+
+; CHECK-LABEL: vector.body:
+; CHECK: sub <4 x i32>
+; CHECK: sub <4 x i32>
+inner_loop:
+  %local_2 = phi i32 [ 0, %outer_loop ], [ %1, %inner_loop ]
+  %local_3 = phi i32 [ -104, %outer_loop ], [ %0, %inner_loop ]
+  %0 = sub nuw nsw i32 %local_3, %local_4
+  %1 = add nuw nsw i32 %local_2, 1
+  %2 = icmp ugt i32 %local_2, 126
+  br i1 %2, label %outer_tail, label %inner_loop
+
+outer_tail:
+  %3 = phi i32 [ %0, %inner_loop ]
+  store atomic i32 %3, i32 * %B unordered, align 8
+  %4 = add i32 %local_4, 1
+  %5 = icmp slt i32 %4, 6
+  br i1 %5, label %outer_loop, label %exit
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: @multi-instr
+define i32 @multi-instr(i32* noalias nocapture %A, i32* noalias nocapture %B, i32 %inc) {
+entry:
+  br label %loop
+
+  ; CHECK-LABEL: vector.body:
+  ; CHECK-COUNT-4: add <4 x i32>
+loop:
+  %iv = phi i32 [0, %entry], [%iv_inc, %loop]
+  %redu = phi i32 [0, %entry], [%3, %loop]
+  %gepa = getelementptr inbounds i32, i32* %A, i32 %iv
+  %gepb = getelementptr inbounds i32, i32* %B, i32 %iv
+  %0 = load i32, i32* %gepa
+  %1 = load i32, i32* %gepb
+  %2 = add nuw nsw i32 %redu, %0
+  %3 = add nuw nsw i32 %2, %1
+  %iv_inc = add nuw nsw i32 %iv, 1
+  %4 = icmp ult i32 %iv_inc, 128
+  br i1 %4, label %loop, label %exit
+
+exit:
+  %lcssa = phi i32 [%3, %loop]
+  ret i32 %lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
--- a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
@@ -8,7 +8,7 @@
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP17:%.*]], %[[LATCH]] ]
 ; CHECK: [[LATCH]]:
 ; CHECK: [[TMP13:%.*]] = and <4 x i32> [[VEC_PHI]],
-; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw <4 x i32> [[TMP13]], {{.*}}
+; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], {{.*}}
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK: [[TMP16:%.*]] = trunc <4 x i32> [[TMP14]] to <4 x i8>
 ; CHECK-NEXT: [[TMP17]] = zext <4 x i8> [[TMP16]] to <4 x i32>
diff --git a/llvm/test/Transforms/LoopVectorize/reduction.ll b/llvm/test/Transforms/LoopVectorize/reduction.ll
--- a/llvm/test/Transforms/LoopVectorize/reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction.ll
@@ -300,7 +300,7 @@
 ; In this test the reduction variable is on the LHS and we can vectorize it.
 ;CHECK-LABEL: @reduction_sub_lhs(
 ;CHECK: phi <4 x i32>
-;CHECK: sub nsw <4 x i32>
+;CHECK: sub <4 x i32>
 ;CHECK: ret i32
 define i32 @reduction_sub_lhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
 entry: