diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -531,6 +531,9 @@
   /// vectorizing this phi node.
   void fixReduction(PHINode *Phi);
 
+  /// Clear NSW/NUW flags from reduction instructions if necessary.
+  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
+
   /// The Loop exit block may have single value PHI nodes with some
   /// incoming value. While vectorizing we only handled real values
   /// that were defined inside the loop and we should have one value for
@@ -3709,16 +3712,20 @@
     }
   }
 
+  // Wrap flags are in general invalid after vectorization, clear them.
+  clearReductionWrapFlags(RdxDesc);
+
   // Fix the vector-loop phi.
 
   // Reductions do not have to start at zero. They can start with
   // any loop invariant values.
   BasicBlock *Latch = OrigLoop->getLoopLatch();
   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
+
   for (unsigned Part = 0; Part < UF; ++Part) {
     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
     Value *Val = getOrCreateVectorValue(LoopVal, Part);
-    // Make sure to add the reduction stat value only to the
+    // Make sure to add the reduction start value only to the
     // first unroll part.
     Value *StartVal = (Part == 0) ? VectorStart : Identity;
     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
@@ -3855,6 +3862,37 @@
   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
 }
 
+void InnerLoopVectorizer::clearReductionWrapFlags(
+    RecurrenceDescriptor &RdxDesc) {
+  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
+  if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
+      RK != RecurrenceDescriptor::RK_IntegerMult)
+    return;
+
+  Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
+  assert(LoopExitInstr != nullptr && "null loop exit instruction");
+  SmallVector<Instruction *, 8> Worklist;
+  SmallPtrSet<Instruction *, 8> Visited;
+  Worklist.push_back(LoopExitInstr);
+  Visited.insert(LoopExitInstr);
+
+  while (!Worklist.empty()) {
+    Instruction *Cur = Worklist.pop_back_val();
+    if (isa<OverflowingBinaryOperator>(Cur))
+      for (unsigned Part = 0; Part < UF; ++Part) {
+        Value *V = getOrCreateVectorValue(Cur, Part);
+        cast<Instruction>(V)->dropPoisonGeneratingFlags();
+      }
+
+    for (User *U : Cur->users()) {
+      Instruction *UI = cast<Instruction>(U);
+      if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
+          Visited.insert(UI).second)
+        Worklist.push_back(UI);
+    }
+  }
+}
+
 void InnerLoopVectorizer::fixLCSSAPHIs() {
   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
     if (LCSSAPhi.getNumIncomingValues() == 1) {
diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
@@ -96,7 +96,7 @@
 ; CHECK: [[LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* {{.*}}, i32 4, <8 x i1> [[ICMPULE]], <8 x i32> undef)
 ; CHECK: [[LOAD2:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* {{.*}}, i32 4, <8 x i1> [[ICMPULE]], <8 x i32> undef)
 ; CHECK-NEXT: [[ADD:%.*]] = add nsw <8 x i32> [[LOAD2]], [[LOAD1]]
-; CHECK-NEXT: [[ACCUM]] = add nuw nsw <8 x i32> [[ADD]], [[ACCUM_PHI]]
+; CHECK-NEXT: [[ACCUM]] = add <8 x i32> [[ADD]], [[ACCUM_PHI]]
 ; CHECK: [[LIVEOUT:%.*]] = select <8 x i1> [[ICMPULE]], <8 x i32> [[ACCUM]], <8 x i32> [[ACCUM_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
 ; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -248,8 +248,8 @@
 ; UNROLL-NOSIMPLIFY-NEXT: store i32 2, i32* [[TMP1]], align 4
 ; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE4]]
 ; UNROLL-NOSIMPLIFY: pred.store.continue4:
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = add nsw i32 [[VEC_PHI]], 1
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = add nsw i32 [[VEC_PHI2]], 1
+; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = add i32 [[VEC_PHI]], 1
+; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = add i32 [[VEC_PHI2]], 1
 ; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI]] = select i1 undef, i32 [[VEC_PHI]], i32 [[TMP4]]
 ; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI5]] = select i1 undef, i32 [[VEC_PHI2]], i32 [[TMP5]]
 ; UNROLL-NOSIMPLIFY-NEXT: [[OFFSET_IDX6:%.*]] = add i64 undef, [[INDEX]]
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -139,9 +139,9 @@
 ; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32>
 ; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32>
 ; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32>
-; CHECK: add nsw <4 x i32>
+; CHECK: add <4 x i32>
 ; CHECK: sub <4 x i32>
-; CHECK: add nsw <4 x i32>
+; CHECK: add <4 x i32>
 ; CHECK: sub <4 x i32>
 
 %struct.ST4 = type { i32, i32, i32, i32 }
@@ -529,7 +529,7 @@
 ; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32>
 ; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32>
 ; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float>
-; CHECK: add nsw <4 x i32>
+; CHECK: add <4 x i32>
 ; CHECK: fadd fast <4 x float>
 
 %struct.IntFloat = type { i32, float }
@@ -645,7 +645,7 @@
 ; CHECK: store i32 %[[X4:.+]], {{.*}}
 ; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
 ; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32>
-; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]
+; CHECK: add <4 x i32> %[[S1]], %[[Phi]]
 
 define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
 entry:
@@ -746,7 +746,7 @@
 ; CHECK: store i32 %[[X4:.+]], {{.*}}
 ; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
 ; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32>
-; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]
+; CHECK: add <4 x i32> %[[S1]], %[[Phi]]
 
 define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/no_int_induction.ll b/llvm/test/Transforms/LoopVectorize/no_int_induction.ll
--- a/llvm/test/Transforms/LoopVectorize/no_int_induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/no_int_induction.ll
@@ -10,7 +10,7 @@
 ;CHECK: phi i64
 ;CHECK: phi <4 x i32>
 ;CHECK: load <4 x i32>
-;CHECK: add nsw <4 x i32>
+;CHECK: add <4 x i32>
 ;CHECK: ret i32
 define i32 @sum_array(i32* %A, i32 %n) nounwind uwtable readonly noinline ssp {
   %1 = sext i32 %n to i64
@@ -37,7 +37,7 @@
 ;CHECK: phi i16
 ;CHECK: phi <4 x i32>
 ;CHECK: load <4 x i32>
-;CHECK: add nsw <4 x i32>
+;CHECK: add <4 x i32>
 ;CHECK: ret i32
 define i32 @sum_array_as1(i32 addrspace(1)* %A, i32 %n) nounwind uwtable readonly noinline ssp {
   %1 = sext i32 %n to i64
diff --git a/llvm/test/Transforms/LoopVectorize/nuw.ll b/llvm/test/Transforms/LoopVectorize/nuw.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/nuw.ll
@@ -0,0 +1,58 @@
+; RUN: opt %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -S | FileCheck %s
+
+; Fixes PR43828
+
+define void @test(i32* %B) {
+; CHECK-LABEL: @test(
+; CHECK: vector.body:
+; CHECK-COUNT-2: sub <4 x i32>
+entry:
+  br label %outer_loop
+
+outer_loop:
+  %local_4 = phi i32 [ 2, %entry ], [ %4, %outer_tail]
+  br label %inner_loop
+
+inner_loop:
+  %local_2 = phi i32 [ 0, %outer_loop ], [ %1, %inner_loop ]
+  %local_3 = phi i32 [ -104, %outer_loop ], [ %0, %inner_loop ]
+  %0 = sub nuw nsw i32 %local_3, %local_4
+  %1 = add nuw nsw i32 %local_2, 1
+  %2 = icmp ugt i32 %local_2, 126
+  br i1 %2, label %outer_tail, label %inner_loop
+
+outer_tail:
+  %3 = phi i32 [ %0, %inner_loop ]
+  store atomic i32 %3, i32 * %B unordered, align 8
+  %4 = add i32 %local_4, 1
+  %5 = icmp slt i32 %4, 6
+  br i1 %5, label %outer_loop, label %exit
+
+exit:
+  ret void
+}
+
+define i32 @multi-instr(i32* noalias nocapture %A, i32* noalias nocapture %B, i32 %inc) {
+; CHECK-LABEL: @multi-instr(
+; CHECK: vector.body:
+; CHECK-COUNT-4: add <4 x i32>
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv_inc, %loop]
+  %redu = phi i32 [0, %entry], [%3, %loop]
+  %gepa = getelementptr inbounds i32, i32* %A, i32 %iv
+  %gepb = getelementptr inbounds i32, i32* %B, i32 %iv
+  %0 = load i32, i32* %gepa
+  %1 = load i32, i32* %gepb
+  %2 = add nuw nsw i32 %redu, %0
+  %3 = add nuw nsw i32 %2, %1
+  %iv_inc = add nuw nsw i32 %iv, 1
+  %4 = icmp ult i32 %iv_inc, 128
+  br i1 %4, label %loop, label %exit
+
+exit:
+  %lcssa = phi i32 [%3, %loop]
+  ret i32 %lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
--- a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
@@ -8,7 +8,7 @@
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP17:%.*]], %[[LATCH]] ]
 ; CHECK: [[LATCH]]:
 ; CHECK: [[TMP13:%.*]] = and <4 x i32> [[VEC_PHI]],
-; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw <4 x i32> [[TMP13]], {{.*}}
+; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], {{.*}}
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK: [[TMP16:%.*]] = trunc <4 x i32> [[TMP14]] to <4 x i8>
 ; CHECK-NEXT: [[TMP17]] = zext <4 x i8> [[TMP16]] to <4 x i32>
diff --git a/llvm/test/Transforms/LoopVectorize/reduction.ll b/llvm/test/Transforms/LoopVectorize/reduction.ll
--- a/llvm/test/Transforms/LoopVectorize/reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction.ll
@@ -300,7 +300,7 @@
 ; In this test the reduction variable is on the LHS and we can vectorize it.
 ;CHECK-LABEL: @reduction_sub_lhs(
 ;CHECK: phi <4 x i32>
-;CHECK: sub nsw <4 x i32>
+;CHECK: sub <4 x i32>
 ;CHECK: ret i32
 define i32 @reduction_sub_lhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
 entry:
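
For context, a minimal sketch of why the wrap flags cannot be kept (loosely modeled on @test in nuw.ll above; the value names, constants and the schematic vector.body are illustrative, not actual vectorizer output). The vector lanes of the partial reduction do not compute the same running values as the scalar accumulator, so a no-wrap fact proven for the scalar chain does not transfer to the widened operations:

; Scalar form: %acc starts at -104 (a very large value when read as unsigned)
; and only a bounded number of small values are subtracted from it, so the
; subtraction provably never wraps and can legitimately carry "nuw nsw".
loop:
  %acc = phi i32 [ -104, %entry ], [ %acc.next, %loop ]
  %acc.next = sub nuw nsw i32 %acc, %val
  ...

; Vectorized form (VF = 4, schematic): only lane 0 of the phi carries the
; start value; the remaining lanes start from the identity 0, so e.g. lane 1
; computes 0 - %val and wraps around zero. If the widened sub kept "nuw",
; that lane would be poison and the final reduced value undefined, even
; though the scalar loop never wrapped.
vector.body:
  %vec.phi = phi <4 x i32> [ <i32 -104, i32 0, i32 0, i32 0>, %vector.ph ], [ %vec.next, %vector.body ]
  %vec.next = sub <4 x i32> %vec.phi, %wide.load
  ...

This is why clearReductionWrapFlags() walks the use chain forward from the reduction's loop-exit instruction, through the header phi and around the reduction cycle, and calls dropPoisonGeneratingFlags() on every overflowing binary operator it reaches, for each unroll part.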