Index: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4062,7 +4062,14 @@
   SmallVector<Value *, 32> ReducedVals;
 
   BinaryOperator *ReductionRoot;
-  PHINode *ReductionPHI;
+  // After a successful horizontal reduction vectorization attempt for a PHI
+  // node, the vectorizer tries to update the root binary op by combining the
+  // vectorized tree and the ReductionPHI node. But during vectorization this
+  // ReductionPHI can itself be vectorized and replaced by an undef value,
+  // while the instruction itself is marked for deletion. This dead PHI node
+  // can then be used in the new binary operation, causing a "Use still stuck
+  // around after Def is destroyed" crash upon PHI node deletion.
+  WeakVH ReductionPHI;
 
   /// The opcode of the reduction.
   unsigned ReductionOpcode;
@@ -4081,8 +4088,8 @@
   unsigned MinVecRegSize;
 
   HorizontalReduction(unsigned MinVecRegSize)
-      : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
-        ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0),
+      : ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0),
+        IsPairwiseReduction(false), ReduxWidth(0),
         MinVecRegSize(MinVecRegSize) {}
 
   /// \brief Try to find a reduction tree.
@@ -4247,7 +4254,7 @@
                                      ReducedVals[i]);
       }
       // Update users.
-      if (ReductionPHI) {
+      if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
         assert(ReductionRoot && "Need a reduction operation");
         ReductionRoot->setOperand(0, VectorizedTree);
         ReductionRoot->setOperand(1, ReductionPHI);
Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll
===================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll
@@ -1,6 +1,5 @@
-; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=NOSTORE
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
 
 ; #include
 ;
@@ -15,9 +14,9 @@
 ;   return sum;
 ; }
 
-; NOSTORE-LABEL: add_red
-; NOSTORE: fmul <4 x float>
-; NOSTORE: shufflevector <4 x float>
+; CHECK-LABEL: add_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
 
 define i32 @add_red(float* %A, i32 %n) {
 entry:
@@ -148,8 +147,8 @@
 ; }
 
 ; CHECK-LABEL: long_red
-; CHECK: fmul fast <4 x float>
-; CHECK: shufflevector <4 x float>
+; CHECK: fmul fast <8 x float>
+; CHECK: shufflevector <8 x float>
 
 define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
 entry:
@@ -305,6 +304,149 @@
   ret i32 %sum.0.lcssa
 }
 
+; void foo(const float *arg_A, unsigned arg_B, float *array) {
+;   for (uint32_t i = 0; i < 6; ++i) {
+;     const float *ptr = arg_A + i;
+;     float w0 = array[i * 4 + 0];
+;     float w1 = array[i * 4 + 1];
+;     float w2 = array[i * 4 + 2];
+;     float w3 = array[i * 4 + 3];
+;
+;     for (unsigned j = 0; j < arg_B; ++j) {
+;       const float x1 = *ptr - (-1.1f * w0) - (1.2f * w1);
+;       const float x2 = (2.1f * x1) + (-2.2f * w0) + (2.3f * w1);
+;       const float x3 = x2 - (-3.1f * w2) - (3.2f * w3);
+;       const float x4 = x3 + (-4.0f * w2) + w3;
+;       w1 = w0;
+;       w0 = x1;
+;       w3 = w2;
+;       w2 = x3;
+;     }
+;
+;     array[i * 4 + 0] = w0;
+;     array[i * 4 + 1] = w1;
+;     array[i * 4 + 2] = w2;
+;     array[i * 4 + 3] = w3;
+;   }
+; }
+
+define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, float* nocapture %array) {
+; CHECK-LABEL: @foo(
+; CHECK: fmul fast <4 x float>
+; CHECK: shufflevector <4 x float>
+;
+entry:
+  %cmp1495 = icmp eq i32 %arg_B, 0
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup15
+  ret void
+
+for.body:                                         ; preds = %for.cond.cleanup15, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond.cleanup15 ]
+  %0 = shl i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds float, float* %array, i64 %0
+  %1 = load float, float* %arrayidx, align 4
+  %2 = or i64 %0, 1
+  %arrayidx4 = getelementptr inbounds float, float* %array, i64 %2
+  %3 = load float, float* %arrayidx4, align 4
+  %4 = or i64 %0, 2
+  %arrayidx8 = getelementptr inbounds float, float* %array, i64 %4
+  %5 = load float, float* %arrayidx8, align 4
+  %6 = or i64 %0, 3
+  %arrayidx12 = getelementptr inbounds float, float* %array, i64 %6
+  %7 = load float, float* %arrayidx12, align 4
+  br i1 %cmp1495, label %for.cond.cleanup15, label %for.body16.lr.ph
+
+for.body16.lr.ph:                                 ; preds = %for.body
+  %add.ptr = getelementptr inbounds float, float* %arg_A, i64 %indvars.iv
+  %8 = load float, float* %add.ptr, align 4
+  br label %for.body16
+
+for.cond.cleanup15:                               ; preds = %for.body16, %for.body
+  %w2.0.lcssa = phi float [ %5, %for.body ], [ %sub28, %for.body16 ]
+  %w3.0.lcssa = phi float [ %7, %for.body ], [ %w2.096, %for.body16 ]
+  %w1.0.lcssa = phi float [ %3, %for.body ], [ %w0.0100, %for.body16 ]
+  %w0.0.lcssa = phi float [ %1, %for.body ], [ %sub19, %for.body16 ]
+  store float %w0.0.lcssa, float* %arrayidx, align 4
+  store float %w1.0.lcssa, float* %arrayidx4, align 4
+  store float %w2.0.lcssa, float* %arrayidx8, align 4
+  store float %w3.0.lcssa, float* %arrayidx12, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond109 = icmp eq i64 %indvars.iv.next, 6
+  br i1 %exitcond109, label %for.cond.cleanup, label %for.body
+
+for.body16:                                       ; preds = %for.body16, %for.body16.lr.ph
+  %w0.0100 = phi float [ %1, %for.body16.lr.ph ], [ %sub19, %for.body16 ]
+  %w1.099 = phi float [ %3, %for.body16.lr.ph ], [ %w0.0100, %for.body16 ]
+  %j.098 = phi i32 [ 0, %for.body16.lr.ph ], [ %inc, %for.body16 ]
+  %w3.097 = phi float [ %7, %for.body16.lr.ph ], [ %w2.096, %for.body16 ]
+  %w2.096 = phi float [ %5, %for.body16.lr.ph ], [ %sub28, %for.body16 ]
+  %mul17 = fmul fast float %w0.0100, 0x3FF19999A0000000
+  %mul18.neg = fmul fast float %w1.099, 0xBFF3333340000000
+  %sub92 = fadd fast float %mul17, %mul18.neg
+  %sub19 = fadd fast float %sub92, %8
+  %mul20 = fmul fast float %sub19, 0x4000CCCCC0000000
+  %mul21.neg = fmul fast float %w0.0100, 0xC0019999A0000000
+  %mul23 = fmul fast float %w1.099, 0x4002666660000000
+  %mul25 = fmul fast float %w2.096, 0x4008CCCCC0000000
+  %mul27.neg = fmul fast float %w3.097, 0xC0099999A0000000
+  %add2293 = fadd fast float %mul27.neg, %mul25
+  %add24 = fadd fast float %add2293, %mul23
+  %sub2694 = fadd fast float %add24, %mul21.neg
+  %sub28 = fadd fast float %sub2694, %mul20
+  %inc = add nuw i32 %j.098, 1
+  %exitcond = icmp eq i32 %inc, %arg_B
+  br i1 %exitcond, label %for.cond.cleanup15, label %for.body16
+}
+
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
+
+; void foo(double * restrict A, double * restrict B, double * restrict C,
+;          int n) {
+;   for (intptr_t i=0; i < n; ++i) {
+;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
+;   }
+; }
+
+; STORE-LABEL: store_red_double
+; STORE: fmul fast <2 x double>
+; STORE: extractelement <2 x double>
+; STORE: extractelement <2 x double>
+
+define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
+entry:
+  %cmp17 = icmp sgt i32 %n, 0
+  br i1 %cmp17, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = load double, double* %B, align 8
+  %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
+  %1 = load double, double* %arrayidx4, align 8
+  %2 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %mul = shl nsw i64 %i.018, 2
+  %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
+  %3 = load double, double* %arrayidx2, align 8
+  %mul3 = fmul fast double %0, %3
+  %add16 = or i64 %mul, 1
+  %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
+  %4 = load double, double* %arrayidx6, align 8
+  %mul7 = fmul fast double %1, %4
+  %add8 = fadd fast double %mul3, %mul7
+  %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
+  store double %add8, double* %arrayidx9, align 8
+  %inc = add nsw i64 %i.018, 1
+  %exitcond = icmp eq i64 %inc, %2
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 ; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
 ;   float sum = 0;
 ;   for (intptr_t i=0; i < n; ++i) {
@@ -316,9 +458,9 @@
 ;   return sum;
 ; }
 
-; CHECK-LABEL: store_red
-; CHECK: fmul fast <4 x float>
-; CHECK: shufflevector <4 x float>
+; STORE-LABEL: store_red
+; STORE: fmul fast <4 x float>
+; STORE: shufflevector <4 x float>
 
 define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
 entry:
@@ -368,50 +510,3 @@
 
   ret i32 0
 }
-
-; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
-
-; void foo(double * restrict A, double * restrict B, double * restrict C,
-;          int n) {
-;   for (intptr_t i=0; i < n; ++i) {
-;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
-;   }
-; }
-
-; STORE-LABEL: store_red_double
-; STORE: fmul fast <2 x double>
-; STORE: extractelement <2 x double>
-; STORE: extractelement <2 x double>
-
-define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
-entry:
-  %cmp17 = icmp sgt i32 %n, 0
-  br i1 %cmp17, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:
-  %0 = load double, double* %B, align 8
-  %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
-  %1 = load double, double* %arrayidx4, align 8
-  %2 = sext i32 %n to i64
-  br label %for.body
-
-for.body:
-  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
-  %mul = shl nsw i64 %i.018, 2
-  %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
-  %3 = load double, double* %arrayidx2, align 8
-  %mul3 = fmul fast double %0, %3
-  %add16 = or i64 %mul, 1
-  %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
-  %4 = load double, double* %arrayidx6, align 8
-  %mul7 = fmul fast double %1, %4
-  %add8 = fadd fast double %mul3, %mul7
-  %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
-  store double %add8, double* %arrayidx9, align 8
-  %inc = add nsw i64 %i.018, 1
-  %exitcond = icmp eq i64 %inc, %2
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
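
Why the guard in the SLPVectorizer hunk works: at the time of this commit, a WeakVH both followed replaceAllUsesWith and nulled itself out when its value was deleted. So once vectorization RAUWs the scalar reduction PHI with undef and queues it for erasure, the ReductionPHI handle observes the undef value, and the new !isa<UndefValue>(ReductionPHI) test keeps the dead PHI out of the rebuilt binary operation. The sketch below is a standalone illustration of those two handle behaviors, not part of the patch: it assumes a recent LLVM, where the RAUW-following behavior that WeakVH had back then is provided by WeakTrackingVH, and the module, function, and value names are illustrative only.

// Standalone sketch (not from the patch). Assumes a recent LLVM, where the
// RAUW-following behavior of the era's WeakVH now lives in WeakTrackingVH.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueHandle.h"
#include <cassert>

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("weakvh-demo", Ctx);
  Function *F = Function::Create(
      FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false),
      Function::ExternalLinkage, "f", &M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  IRBuilder<> Builder(BB);

  // Stand-in for the scalar reduction PHI that HorizontalReduction tracks.
  PHINode *Phi = Builder.CreatePHI(Builder.getFloatTy(), 2, "red.phi");
  WeakTrackingVH ReductionPHI(Phi);

  // Vectorization RAUWs the PHI with undef before scheduling it for deletion.
  // The handle follows the RAUW, so the patch's check
  //   if (ReductionPHI && !isa<UndefValue>(ReductionPHI))
  // now sees undef and refuses to wire the dead PHI into the new binary op.
  Phi->replaceAllUsesWith(UndefValue::get(Phi->getType()));
  assert(isa<UndefValue>(ReductionPHI) && "handle followed RAUW to undef");

  // A handle still watching the PHI nulls itself out when the PHI is erased,
  // which is exactly what a raw PHINode * member could not do.
  WeakTrackingVH Dead(Phi);
  Phi->eraseFromParent();
  assert(!Dead && "handle nulled on deletion");
  return 0;
}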