Index: lib/Target/X86/X86FixupBWInsts.cpp =================================================================== --- lib/Target/X86/X86FixupBWInsts.cpp +++ lib/Target/X86/X86FixupBWInsts.cpp @@ -95,6 +95,12 @@ /// nullptr. MachineInstr *tryReplaceCopy(MachineInstr *MI) const; + // Change the MachineInstr \p MI into an equivalent 32 bit instruction if + // possible. Return the replacement instruction if OK, return nullptr + // otherwise. Set WasCandidate to true or false depending on whether the + // MI was a candidate for this sort of transformation. + MachineInstr *tryReplaceInstr(MachineInstr *MI, MachineBasicBlock &MBB, + bool &WasCandidate) const; public: static char ID; @@ -267,6 +273,54 @@ return MIB; } +MachineInstr *FixupBWInstPass::tryReplaceInstr( + MachineInstr *MI, MachineBasicBlock &MBB, + bool &WasCandidate) const { + MachineInstr *NewMI = nullptr; + WasCandidate = false; + + // See if this is an instruction of the type we are currently looking for. + switch (MI->getOpcode()) { + + case X86::MOV8rm: + // Only replace 8 bit loads with the zero extending versions if + // in an inner most loop and not optimizing for size. This takes + // an extra byte to encode, and provides limited performance upside. + if (MachineLoop *ML = MLI->getLoopFor(&MBB)) { + if (ML->begin() == ML->end() && !OptForSize) { + NewMI = tryReplaceLoad(X86::MOVZX32rm8, MI); + WasCandidate = true; + } + } + break; + + case X86::MOV16rm: + // Always try to replace 16 bit load with 32 bit zero extending. + // Code size is the same, and there is sometimes a perf advantage + // from eliminating a false dependence on the upper portion of + // the register. + NewMI = tryReplaceLoad(X86::MOVZX32rm16, MI); + WasCandidate = true; + break; + + case X86::MOV8rr: + case X86::MOV16rr: + // Always try to replace 8/16 bit copies with a 32 bit copy. 
+ // Code size is either less (16) or equal (8), and there is sometimes a + // perf advantage from eliminating a false dependence on the upper portion + // of the register. + NewMI = tryReplaceCopy(MI); + WasCandidate = true; + break; + + default: + // nothing to do here. + break; + } + + return NewMI; +} + void FixupBWInstPass::processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB) { @@ -280,7 +334,9 @@ // and notes that and the original in a data structure, until the // whole BB has been analyzed. This keeps the replacement instructions // from making it seem as if the larger register might be live. - SmallVector<std::pair<MachineInstr *, MachineInstr *>, 8> MIReplacements; + typedef SmallVector<std::pair<MachineInstr *, MachineInstr *>, 8> + MIReplacementsType; + MIReplacementsType MIReplacements; // Start computing liveness for this block. We iterate from the end to be able // to update this for each instruction. @@ -288,57 +344,59 @@ // We run after PEI, so we need to AddPristinesAndCSRs. LiveRegs.addLiveOuts(MBB); + bool CandidateDidntGetTransformed = false; + bool WasCandidate = false; + for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) { - MachineInstr *NewMI = nullptr; MachineInstr *MI = &*I; + + MachineInstr *NewMI = tryReplaceInstr(MI, MBB, WasCandidate); - // See if this is an instruction of the type we are currently looking for. - switch (MI->getOpcode()) { - - case X86::MOV8rm: - // Only replace 8 bit loads with the zero extending versions if - // in an inner most loop and not optimizing for size. This takes - // an extra byte to encode, and provides limited performance upside. - if (MachineLoop *ML = MLI->getLoopFor(&MBB)) { - if (ML->begin() == ML->end() && !OptForSize) - NewMI = tryReplaceLoad(X86::MOVZX32rm8, MI); - } - break; - - case X86::MOV16rm: - // Always try to replace 16 bit load with 32 bit zero extending. - // Code size is the same, and there is sometimes a perf advantage - // from eliminating a false dependence on the upper portion of - // the register. 
- NewMI = tryReplaceLoad(X86::MOVZX32rm16, MI); - break; - - case X86::MOV8rr: - case X86::MOV16rr: - // Always try to replace 8/16 bit copies with a 32 bit copy. - // Code size is either less (16) or equal (8), and there is sometimes a - // perf advantage from eliminating a false dependence on the upper portion - // of the register. - NewMI = tryReplaceCopy(MI); - break; - - default: - // nothing to do here. - break; - } - - if (NewMI) + // Add this to replacements if it was a candidate, even if NewMI is + // nullptr. We will revisit that in a bit. + if (WasCandidate) { MIReplacements.push_back(std::make_pair(MI, NewMI)); + if (!NewMI) + CandidateDidntGetTransformed = true; + } // We're done with this instruction, update liveness for the next one. LiveRegs.stepBackward(*MI); } + if (CandidateDidntGetTransformed) { + // If there was a candidate that didn't get transformed then let's try + // doing the register liveness going forward. Sometimes one direction + // is overly conservative compared to the other. + LiveRegs.clear(); + LiveRegs.addLiveIns(MBB); + + auto NextCandidateIter = MIReplacements.begin(); + + for (auto I = MBB.begin(); I != MBB.end(); ++I) { + MachineInstr *MI = &*I; + SmallVector<std::pair<unsigned, const MachineOperand *>, 4> Clobbers; + LiveRegs.stepForward(*MI, Clobbers); + + // Only check and create a new instruction if this instruction is + // known to be a candidate that didn't get transformed. 
+ if (NextCandidateIter->first == MI) { + if (NextCandidateIter->second == nullptr) { + MachineInstr *NewMI = tryReplaceInstr(MI, MBB, WasCandidate); + NextCandidateIter->second = NewMI; + } + ++NextCandidateIter; + } + } + } + while (!MIReplacements.empty()) { MachineInstr *MI = MIReplacements.back().first; MachineInstr *NewMI = MIReplacements.back().second; MIReplacements.pop_back(); - MBB.insert(MI, NewMI); - MBB.erase(MI); + if (NewMI) { + MBB.insert(MI, NewMI); + MBB.erase(MI); + } } } Index: test/CodeGen/X86/fixup-bw-inst-fwlive.ll =================================================================== --- test/CodeGen/X86/fixup-bw-inst-fwlive.ll +++ test/CodeGen/X86/fixup-bw-inst-fwlive.ll @@ -0,0 +1,117 @@ +; RUN: llc -fixup-byte-word-insts=1 -march=x86-64 < %s | \ +; RUN: FileCheck -check-prefix CHECK -check-prefix BWON %s +; RUN: llc -fixup-byte-word-insts=0 -march=x86-64 < %s | \ +; RUN: FileCheck -check-prefix CHECK -check-prefix BWOFF %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; These CHECKs are complex. They were derived by running +; fixup-byte-word-insts with and without the forward liveness checking code +; in order to create this unit test, and using bugpoint to reduce the test +; to something as simple as possible that would reproduce the need for the +; forward liveness. 
+; +; CHECK-LABEL: getAndMoveToFrontDecode: +; CHECK-LABEL: # BB#1: +; BWON: movl %eax, %ecx +; BWOFF: movb %al, %cl +; CHECK-NEXT: shll %cl, %esi + +@bsLive = external global i32, align 4 +@bsBuff = external global i32, align 4 +@limit = external global [6 x [258 x i32]], align 16 + +define void @getAndMoveToFrontDecode() { +entry: + br label %while.body.i.i + +while.body.i.i: ; preds = %if.end.i.i, %entry + br i1 false, label %if.then.i.i, label %if.end.i.i + +if.then.i.i: ; preds = %while.body.i.i + unreachable + +if.end.i.i: ; preds = %while.body.i.i + br i1 undef, label %while.body.i.i, label %vector.body + +vector.body: ; preds = %if.end.i.i + br label %while.body.i + +while.body.i: ; preds = %if.end.i, %vector.body + br i1 false, label %if.then.i, label %if.end.i + +if.then.i: ; preds = %while.body.i + unreachable + +if.end.i: ; preds = %while.body.i + br i1 undef, label %while.body.i, label %bsR.exit + +bsR.exit: ; preds = %if.end.i + br i1 undef, label %while.end307, label %if.end57 + +if.end57: ; preds = %while.end297, %bsR.exit + br i1 false, label %do.body, label %if.else172 + +do.body: ; preds = %if.end57 + unreachable + +if.else172: ; preds = %if.end57 + %cmp174 = icmp slt i32 undef, undef + br i1 %cmp174, label %if.end177, label %if.then176 + +if.then176: ; preds = %if.else172 + unreachable + +if.end177: ; preds = %if.else172 + %0 = load i32, i32* undef, align 4 + %idxprom264480 = sext i32 %0 to i64 + br i1 false, label %while.body.i438, label %entry.while.end_crit_edge.i435 + +entry.while.end_crit_edge.i435: ; preds = %if.end177 + %.pre.i434 = load i32, i32* @bsBuff, align 4 + %1 = trunc i64 %idxprom264480 to i32 + %2 = trunc i64 %idxprom264480 to i32 + %sub.i447 = sub nsw i32 0, %1 + %shr.i448 = lshr i32 %.pre.i434, %sub.i447 + %shl4.i449 = shl i32 1, %2 + %sub5.i450 = add nsw i32 %shl4.i449, -1 + %and6.i451 = and i32 %shr.i448, %sub5.i450 + store i32 %sub.i447, i32* @bsLive, align 4 + %arrayidx267481 = getelementptr inbounds [6 x [258 x i32]], [6 
x [258 x i32]]* @limit, i64 0, i64 undef, i64 %idxprom264480 + %3 = load i32, i32* %arrayidx267481, align 4 + %cmp268482 = icmp sgt i32 %and6.i451, %3 + br i1 %cmp268482, label %while.body270, label %while.end297 + +while.body.i438: ; preds = %if.end177 + unreachable + +while.body270: ; preds = %while.end290, %entry.while.end_crit_edge.i435 + %zvec248.0484 = phi i32 [ %or296, %while.end290 ], [ %and6.i451, %entry.while.end_crit_edge.i435 ] + %indvars.iv.next529 = add i64 undef, 1 + %cmp273478 = icmp slt i32 undef, 1 + br i1 %cmp273478, label %while.body275, label %while.end290 + +while.body275: ; preds = %if.end282, %while.body270 + br i1 undef, label %if.then281, label %if.end282 + +if.then281: ; preds = %while.body275 + unreachable + +if.end282: ; preds = %while.body275 + br i1 undef, label %while.body275, label %while.end290 + +while.end290: ; preds = %if.end282, %while.body270 + %shl295 = shl i32 %zvec248.0484, 1 + %or296 = or i32 0, %shl295 + %arrayidx267 = getelementptr inbounds [6 x [258 x i32]], [6 x [258 x i32]]* @limit, i64 0, i64 undef, i64 %indvars.iv.next529 + %4 = load i32, i32* %arrayidx267, align 4 + %cmp268 = icmp sgt i32 %or296, %4 + br i1 %cmp268, label %while.body270, label %while.end297 + +while.end297: ; preds = %while.end290, %entry.while.end_crit_edge.i435 + br i1 undef, label %while.end307, label %if.end57 + +while.end307: ; preds = %while.end297, %bsR.exit + ret void +}