Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -436,8 +436,6 @@
                                SmallPtrSetImpl<Instruction *> &AllowedExit) {
   // Reductions, Inductions and non-header phis are allowed to have exit users. All
   // other instructions must not have external users.
-  // TODO: Non-phi instructions can also be taught to have exit users, now that
-  // we know how to extract the last scalar element from the loop.
   if (!AllowedExit.count(Inst))
     // Check that all of the users of the loop are inside the BB.
     for (User *U : Inst->users()) {
@@ -626,6 +624,20 @@
           continue;
         }

+        // TODO: Instead of recording the AllowedExit, it would be good to record the
+        // complementary set: NotAllowedExit. These include (but may not be
+        // limited to):
+        //  1. Reduction phis as they represent the one-before-last value, which
+        //     is not available when vectorized
+        //  2. Induction phis and increment when SCEV predicates cannot be used
+        //     outside the loop - see addInductionPhi
+        //  3. Non-Phis with outside uses when SCEV predicates cannot be used
+        //     outside the loop - see call to hasOutsideLoopUser in the non-phi
+        //     handling below
+        //  4. FirstOrderRecurrence phis that can possibly be handled by
+        //     extraction.
+        // By recording these, we can then reason about ways to vectorize each
+        // of these NotAllowedExit.
         InductionDescriptor ID;
         if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
           addInductionPhi(Phi, ID, AllowedExit);
@@ -718,6 +730,14 @@
       // Reduction instructions are allowed to have exit users.
       // All other instructions must not have external users.
       if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
+        // We can safely vectorize loops where instructions within the loop are
+        // used outside the loop only if the SCEV predicates within the loop are
+        // the same as outside the loop. Allowing the exit means reusing the SCEV
+        // outside the loop.
+        if (PSE.getUnionPredicate().isAlwaysTrue()) {
+          AllowedExit.insert(&I);
+          continue;
+        }
         ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I)
                   << "value cannot be used outside the loop");
         return false;
Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3721,11 +3721,18 @@
   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
     if (LCSSAPhi.getNumIncomingValues() == 1) {
       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
+      // Non-instruction incoming values will have only one value.
+      unsigned LastLane = 0;
+      if (isa<Instruction>(IncomingValue))
+        LastLane = Cost->isUniformAfterVectorization(
+                       cast<Instruction>(IncomingValue), VF)
+                       ? 0
+                       : VF - 1;
       // Can be a loop invariant incoming value or the last scalar value to be
       // extracted from the vectorized loop.
       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
       Value *lastIncomingValue =
-          getOrCreateScalarValue(IncomingValue, {UF - 1, VF - 1});
+          getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
     }
   }
@@ -4504,20 +4511,22 @@
   }

   // Expand Worklist in topological order: whenever a new instruction
-  // is added , its users should be either already inside Worklist, or
-  // out of scope. It ensures a uniform instruction will only be used
-  // by uniform instructions or out of scope instructions.
+  // is added, its users should already be inside Worklist. This ensures
+  // a uniform instruction will only be used by uniform instructions.
   unsigned idx = 0;
   while (idx != Worklist.size()) {
     Instruction *I = Worklist[idx++];

     for (auto OV : I->operand_values()) {
+      // isOutOfScope operands cannot be uniform instructions.
       if (isOutOfScope(OV))
         continue;
+      // If all the users of the operand are uniform, then add the
+      // operand into the uniform worklist.
       auto *OI = cast<Instruction>(OV);
       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
             auto *J = cast<Instruction>(U);
-            return !TheLoop->contains(J) || Worklist.count(J) ||
+            return Worklist.count(J) ||
                    (OI == getLoadStorePointerOperand(J) &&
                     isUniformDecision(J, VF));
           })) {
Index: llvm/trunk/test/Transforms/LoopVectorize/no_outside_user.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/no_outside_user.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/no_outside_user.ll
@@ -265,3 +265,150 @@
   %x.0.lcssa = phi i32 [ 0, %entry ], [ %tmp17 , %latch ]
   ret i32 %x.0.lcssa
 }
+
+
+; CHECK-LABEL: @outside_user_non_phi(
+; CHECK: %vec.ind = phi <2 x i32>
+; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp sgt <2 x i32> %vec.ind, <i32 10, i32 10>
+; CHECK: %predphi = select <2 x i1> [[CMP]], <2 x i32> <i32 1, i32 1>, <2 x i32> zeroinitializer
+; CHECK: [[TRUNC:%[a-zA-Z0-9.]+]] = trunc <2 x i32> %predphi to <2 x i8>
+
+; CHECK-LABEL: middle.block:
+; CHECK: [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i8> [[TRUNC]], i32 1
+
+; CHECK-LABEL: f1.exit.loopexit:
+; CHECK: %.lcssa = phi i8 [ %tmp17.trunc, %bb16 ], [ [[E1]], %middle.block ]
+define i8 @outside_user_non_phi() {
+bb:
+  %b.promoted = load i32, i32* @b, align 4
+  br label %.lr.ph.i
+
+.lr.ph.i:
+  %tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ]
+  %tmp2 = icmp sgt i32 %tmp8, 10
+  br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+  br label %bb16
+
+bb16:
+  %tmp17 = phi i32 [ 0, %bb10 ], [ 1, %.lr.ph.i ]
+  %tmp17.trunc = trunc i32 %tmp17 to i8
+  %tmp18 = add nsw i32 %tmp8, 1
+  %tmp19 = icmp slt i32 %tmp18, 4
+  br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+  %.lcssa = phi i8 [ %tmp17.trunc, %bb16 ]
+  ret i8 %.lcssa
+}
+
+; CHECK-LABEL: no_vectorize_reduction_with_outside_use(
+; CHECK-NOT: <2 x i32>
+define i32 @no_vectorize_reduction_with_outside_use(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %or = or i32 %add, %result.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %result.0.lcssa = phi i32 [ 0, %entry ], [ %1, %for.body ]
+  ret i32 %result.0.lcssa
+}
+
+
+; vectorize c[i] = a[i] + b[i] loop where result of c[i] is used outside the
+; loop
+; CHECK-LABEL: sum_arrays_outside_use(
+; CHECK-LABEL: vector.memcheck:
+; CHECK: br i1 %memcheck.conflict, label %scalar.ph, label %vector.ph
+
+; CHECK-LABEL: vector.body:
+; CHECK: %wide.load = load <2 x i32>, <2 x i32>*
+; CHECK: %wide.load16 = load <2 x i32>, <2 x i32>*
+; CHECK: [[ADD:%[a-zA-Z0-9.]+]] = add nsw <2 x i32> %wide.load, %wide.load16
+; CHECK: store <2 x i32>
+
+; CHECK-LABEL: middle.block:
+; CHECK: [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[ADD]], i32 1
+
+; CHECK-LABEL: f1.exit.loopexit:
+; CHECK: %.lcssa = phi i32 [ %sum, %.lr.ph.i ], [ [[E1]], %middle.block ]
+define i32 @sum_arrays_outside_use(i32* %B, i32* %A, i32* %C, i32 %N) {
+bb:
+  %b.promoted = load i32, i32* @b, align 4
+  br label %.lr.ph.i
+
+.lr.ph.i:
+  %iv = phi i32 [ %ivnext, %.lr.ph.i ], [ %b.promoted, %bb ]
+  %indvars.iv = sext i32 %iv to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %Bload = load i32, i32* %arrayidx2, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %Aload = load i32, i32* %arrayidx, align 4
+  %sum = add nsw i32 %Bload, %Aload
+  %arrayidx3 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  store i32 %sum, i32* %arrayidx3, align 4
+  %ivnext = add nsw i32 %iv, 1
+  %tmp19 = icmp slt i32 %ivnext, %N
+  br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+  %.lcssa = phi i32 [ %sum, %.lr.ph.i ]
+  ret i32 %.lcssa
+}
+
+@tab = common global [32 x i8] zeroinitializer, align 1
+
+; CHECK-LABEL: non_uniform_live_out()
+; CHECK-LABEL: vector.body:
+; CHECK: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK: [[ADD:%[a-zA-Z0-9.]+]] = add <2 x i32> %vec.ind, <i32 7, i32 7>
+; CHECK: [[EE:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[ADD]], i32 0
+; CHECK: [[GEP:%[a-zA-Z0-9.]+]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[EE]]
+; CHECK-NEXT: [[GEP2:%[a-zA-Z0-9.]+]] = getelementptr inbounds i8, i8* [[GEP]], i32 0
+; CHECK-NEXT: [[BC:%[a-zA-Z0-9.]+]] = bitcast i8* [[GEP2]] to <2 x i8>*
+; CHECK-NEXT: %wide.load = load <2 x i8>, <2 x i8>* [[BC]]
+; CHECK-NEXT: [[ADD2:%[a-zA-Z0-9.]+]] = add <2 x i8> %wide.load, <i8 1, i8 1>
+; CHECK: store <2 x i8> [[ADD2]], <2 x i8>*
+
+; CHECK-LABEL: middle.block:
+; CHECK: [[ADDEE:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[ADD]], i32 1
+
+; CHECK-LABEL: for.end:
+; CHECK: %lcssa = phi i32 [ %i.09, %for.body ], [ [[ADDEE]], %middle.block ]
+; CHECK: %arrayidx.out = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %lcssa
+define i32 @non_uniform_live_out() {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %i.09 = add i32 %i.08, 7
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.09
+  %0 = load i8, i8* %arrayidx, align 1
+  %bump = add i8 %0, 1
+  store i8 %bump, i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, 20000
+  br i1 %exitcond, label %for.end, label %for.body

+for.end:                                          ; preds = %for.body
+  %lcssa = phi i32 [%i.09, %for.body]
+  %arrayidx.out = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %lcssa
+  store i8 42, i8* %arrayidx.out, align 1
+  ret i32 0
+}
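
For context, the @sum_arrays_outside_use test above corresponds to a C source loop like the sketch below. This is illustrative only and not part of the patch; the function name and signature are made up. The point it shows is the pattern the legality change enables: a non-phi value defined in the loop with a user after the loop, vectorizable only when the loop needs no runtime SCEV predicates (PSE.getUnionPredicate() is always true), with the live-out value taken from the last scalar lane in the middle block (or lane 0 if it is uniform after vectorization).

    /* Illustrative sketch only -- not from the patch. The non-phi value `sum`
       defined inside the loop is used after the loop; with this change the
       loop vectorizer can keep the loop vectorized and extract the last
       scalar lane of `sum` for the use outside the loop. */
    int sum_arrays_last(const int *A, const int *B, int *C, int N) {
      int sum = 0;
      for (int i = 0; i < N; ++i) {
        sum = A[i] + B[i]; /* non-phi instruction with an outside-loop user */
        C[i] = sum;
      }
      return sum; /* outside-loop user fed from the vector loop's last lane */
    }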