Index: lib/Target/X86/CMakeLists.txt
===================================================================
--- lib/Target/X86/CMakeLists.txt
+++ lib/Target/X86/CMakeLists.txt
@@ -20,6 +20,7 @@
   X86FixupBWInsts.cpp
   X86FixupLEAs.cpp
   X86FixupSetCC.cpp
+  X86FixupZExt.cpp
   X86FloatingPoint.cpp
   X86FrameLowering.cpp
   X86ISelDAGToDAG.cpp
Index: lib/Target/X86/X86.h
===================================================================
--- lib/Target/X86/X86.h
+++ lib/Target/X86/X86.h
@@ -62,6 +62,8 @@
 /// Return a pass that transforms setcc + movzx pairs into xor + setcc.
 FunctionPass *createX86FixupSetCC();
 
+FunctionPass *createX86FixupZExt();
+
 /// Return a pass that expands WinAlloca pseudo-instructions.
 FunctionPass *createX86WinAllocaExpander();
 
Index: lib/Target/X86/X86FixupZExt.cpp
===================================================================
--- /dev/null
+++ lib/Target/X86/X86FixupZExt.cpp
@@ -0,0 +1,685 @@
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+
+#include <algorithm>
+
+#define DEBUG_TYPE "x86-fixup-zext"
+
+namespace {
+using namespace llvm;
+using std::unique_ptr;
+using std::vector;
+using std::pair;
+using Segment = LiveRange::Segment;
+
+template <typename T, typename Elem>
+using is_iterable_of = typename std::enable_if<
+    std::is_same<typename std::decay<decltype(
+                     *std::declval<T>().begin())>::type,
+                 Elem>::value>::type;
+
+template <typename T> auto push_to(T &t) -> decltype(std::back_inserter(t)) {
+  return std::back_inserter(t);
+}
+
+unsigned get_phys(unsigned reg, const VirtRegMap &vrm) {
+  return TargetRegisterInfo::isVirtualRegister(reg) ? vrm.getPhys(reg) : reg;
+}
+
+unsigned get_phys(const MachineOperand &regop, const VirtRegMap &vrm) {
+  const auto *f = regop.getParent()->getParent()->getParent();
+  const auto &tri = *f->getSubtarget().getRegisterInfo();
+  assert(regop.isReg());
+  unsigned preg = get_phys(regop.getReg(), vrm);
+  return regop.getSubReg() ? tri.getSubReg(preg, regop.getSubReg()) : preg;
+}
+
+unsigned get_phys(const MachineInstr &i, unsigned opnum,
+                  const VirtRegMap &vrm) {
+  return get_phys(i.getOperand(opnum), vrm);
+}
+
+DenseMap<MachineBasicBlock *, MachineInstr *>
+dominating_defs(unsigned gr8, const MachineRegisterInfo &mri,
+                const SlotIndexes &si) {
+  DenseMap<MachineBasicBlock *, MachineInstr *> defs;
+  // at least until release_37, getInstructionIndex is expensive.
+  DenseMap<MachineBasicBlock *, SlotIndex> cached;
+
+  for (MachineInstr &def : mri.def_instructions(gr8)) {
+    unsigned tied_use;
+    if (def.isRegTiedToUseOperand(0, &tied_use) &&
+        def.getOperand(tied_use).getReg() != def.getOperand(0).getReg()) {
+      DEBUG(dbgs() << "dominating_defs: " << def.getOperand(0)
+                   << " is tied to " << def.getOperand(tied_use) << "\n");
+      return dominating_defs(def.getOperand(tied_use).getReg(), mri, si);
+    }
+    MachineBasicBlock *bb = def.getParent();
+    if (defs.find(bb) == defs.end() ||
+        si.getInstructionIndex(def) < cached.lookup(bb)) {
+      cached[bb] = si.getInstructionIndex(def);
+      defs[bb] = &def;
+    }
+  }
+  return defs;
+}
+
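+// A sketch of the rewrite this pass aims for, on a typical setcc + zext
+// sequence (register choices illustrative only, taken from the test updates
+// below):
+//
+//   before:                        after:
+//     ucomiss %xmm1, %xmm0           xorl    %eax, %eax    <- MOV32r0
+//     sete    %cl                    ucomiss %xmm1, %xmm0
+//     movzbl  %cl, %eax              sete    %al
+//
+// The MOV32r0 has to sit where EFLAGS is dead, hence the EFLAGS live-range
+// walk in insert_mov32r0 below (hoisting above the flag def, or into a lone
+// predecessor when flags are live into the block).
+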
+void add_seg(SlotIndex s, SlotIndex e, LiveInterval &live, LiveIntervals &li) {
+  VNInfo *valno = !live.hasAtLeastOneValue()
+                      ? live.getNextValue(s, li.getVNInfoAllocator())
+                      : *live.vni_begin();
+  assert(live.getNumValNums() == 1);
+  live.addSegment(Segment(std::move(s), std::move(e), valno));
+}
+
+void add_seg(MachineInstr &s, MachineInstr &e, LiveInterval &live,
+             LiveIntervals &li) {
+  return add_seg(li.getInstructionIndex(s), li.getInstructionIndex(e), live,
+                 li);
+}
+
+void add_segs(LiveInterval &src, LiveInterval &dest, LiveIntervals &li) {
+  for (const Segment &s : src) {
+    add_seg(s.start, s.end, dest, li);
+  }
+}
+
+MachineInstr *insert_mov32r0(MachineInstr &def8, LiveInterval &live,
+                             LiveIntervals &li) {
+  auto slot = [&](MachineInstr &i) { return li.getInstructionIndex(i); };
+  const MachineFunction &f = *def8.getParent()->getParent();
+  const auto &tri = f.getSubtarget().getRegisterInfo();
+  MachineBasicBlock &bb = *def8.getParent();
+  MachineBasicBlock::iterator ins = &def8;
+
+  if (const Segment *eflagseg =
+          li.getRegUnit(*MCRegUnitIterator(X86::EFLAGS, tri))
+              .getSegmentContaining(slot(def8))) {
+    if (eflagseg->start <= slot(*bb.begin()) && bb.isLiveIn(X86::EFLAGS)) {
+      if (bb.pred_size() > 1) {
+        return nullptr;
+      }
+      add_seg(li.getMBBStartIdx(&bb), slot(def8), live, li);
+      return insert_mov32r0(*(*bb.pred_begin())->rbegin(), live, li);
+    }
+    ins = li.getInstructionFromIndex(eflagseg->start);
+  }
+  // insert dummy mov32r0
+  MachineInstrBuilder mib =
+      BuildMI(bb, ins, def8.getDebugLoc(),
+              f.getSubtarget().getInstrInfo()->get(X86::MOV32r0), 0);
+  return mib;
+}
+
+template <typename T, typename = is_iterable_of<T, LiveInterval *>>
+raw_ostream &operator<<(raw_ostream &out, const T &es) {
+  for (LiveInterval *e : es) {
+    out << "\t" << (*e) << "\n";
+  }
+  return out;
+}
+
+template <typename T, typename = is_iterable_of<T, LiveInterval *>>
+bool interferes(const T &as, const LiveInterval &b,
+                const MachineRegisterInfo &mri) {
+  return any_of(as, [&](const LiveInterval *a) { return a->overlaps(b); });
+}
+
+template <typename Iterator, typename Predicate>
+Iterator move_to_end_if(Iterator first, Iterator last, Predicate p) {
+  Iterator rv = last;
+  while (first != rv) {
+    if (p(*first)) {
+      --rv;
+      std::swap(*first, *rv);
+    } else {
+      ++first;
+    }
+  }
+  return rv;
+}
+
+template <typename Range, typename Predicate>
+auto move_to_end_if(Range &r, Predicate p) -> decltype(r.end()) {
+  return move_to_end_if(r.begin(), r.end(), std::move(p));
+}
+
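+// ReAllocTool layers a small backtracking allocator on top of LiveRegMatrix
+// and VirtRegMap. The protocol, as used in runOnMachineFunction below, is
+// roughly:
+//
+//   ReAllocTool ratool(f, lrm, vrm);
+//   c.unassign(ratool);                        // pull ranges out of the matrix
+//   if (MCPhysReg r = ratool.alloc(*c.extra))  // conflict-free GR32, or 0
+//     c.assign_new(lrm, li, r);                // commit the transformation
+//   else
+//     c.assign_old(lrm);                       // roll back to the RA result
+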
+struct ReAllocTool {
+  const TargetRegisterInfo *tri;
+  const MachineRegisterInfo *mri;
+  LiveRegMatrix *lrm;
+  VirtRegMap *vrm;
+  RegisterClassInfo rci;
+  BitVector unused_csr;
+
+  void add_reg_to_bv(BitVector &bv, MCPhysReg reg) const {
+    for (MCRegAliasIterator r(reg, tri, true); r.isValid(); ++r) {
+      bv.set(*r);
+    }
+  }
+
+  BitVector bv_from_regs(ArrayRef<MCPhysReg> regs) const {
+    BitVector rv(tri->getNumRegs());
+    for (const MCPhysReg &r : regs) {
+      add_reg_to_bv(rv, r);
+    }
+    return rv;
+  }
+
+  template <typename Predicate>
+  BitVector bv_from_regs(ArrayRef<MCPhysReg> regs, Predicate p) const {
+    BitVector rv(tri->getNumRegs());
+    for (const MCPhysReg &r : regs) {
+      if (p(r)) {
+        add_reg_to_bv(rv, r);
+      }
+    }
+    return rv;
+  }
+
+  ReAllocTool(const MachineFunction &f, LiveRegMatrix &lrm_, VirtRegMap &vrm_)
+      : tri(f.getSubtarget().getRegisterInfo()), mri(&f.getRegInfo()),
+        lrm(&lrm_), vrm(&vrm_), rci(), unused_csr(tri->getNumRegs()) {
+    const MCPhysReg *csr = tri->getCalleeSavedRegs(&f);
+    for (unsigned i = 0; csr[i] != 0; i += 1) {
+      if (!lrm->isPhysRegUsed(csr[i])) {
+        add_reg_to_bv(unused_csr, csr[i]);
+      }
+    }
+    rci.runOnMachineFunction(f);
+  }
+
+  bool interf(LiveInterval &live, unsigned preg) const {
+    return lrm->checkInterference(live, preg) != LiveRegMatrix::IK_Free;
+  }
+
+  template <typename T, typename = is_iterable_of<T, LiveInterval *>>
+  bool interf(LiveInterval &live, unsigned preg, T &evictees) const {
+    if (lrm->checkRegMaskInterference(live, preg) ||
+        lrm->checkRegUnitInterference(live, preg)) {
+      return true;
+    }
+    DenseSet<LiveInterval *> ev;
+    for (MCRegUnitIterator regunit(preg, tri); regunit.isValid(); ++regunit) {
+      LiveIntervalUnion::Query &q = lrm->query(live, *regunit);
+      if (q.collectInterferingVRegs() > 0) {
+        for (LiveInterval *l : q.interferingVRegs()) {
+          ev.insert(l);
+        }
+      }
+    }
+    std::copy(ev.begin(), ev.end(), push_to(evictees));
+    return evictees.size() > 0;
+  }
+
+  const MCPhysReg *alloc_next(LiveInterval &live,
+                              const BitVector *except = nullptr,
+                              ArrayRef<MCPhysReg>::iterator *it = nullptr,
+                              const TargetRegisterClass *rc = nullptr) const {
+    ArrayRef<MCPhysReg> ord =
+        rci.getOrder(rc ? rc : mri->getRegClass(live.reg));
+    BitVector rs = unused_csr;
+    if (except != nullptr) {
+      rs |= *except;
+    }
+    auto rv = std::find_if(
+        it ? std::next(*it) : ord.begin(), ord.end(),
+        [&](MCPhysReg r) { return !rs.test(r) && !interf(live, r); });
+    return rv == ord.end() ? nullptr : rv;
+  }
+
+  MCPhysReg alloc(LiveInterval &live, const BitVector *except = nullptr,
+                  const TargetRegisterClass *rc = nullptr) const {
+    const MCPhysReg *rv = alloc_next(live, except, nullptr, rc);
+    return rv == nullptr ? 0 : *rv;
+  }
+
+  // (re-)allocate a group of interfering intervals. brute force search.
+  // returns nullptr if impossible.
+  template <typename C, typename = is_iterable_of<C, LiveInterval *>>
+  unique_ptr<vector<pair<LiveInterval *, const MCPhysReg *>>>
+  alloc_interf_intervals(C group, const BitVector *except = nullptr) const {
+    if (group.empty()) {
+      return make_unique<vector<pair<LiveInterval *, const MCPhysReg *>>>();
+    }
+    auto assigned =
+        make_unique<vector<pair<LiveInterval *, const MCPhysReg *>>>();
+
+    auto maybe_unassign = [&](pair<LiveInterval *, const MCPhysReg *> &p) {
+      if (p.second) {
+        lrm->unassign(*p.first);
+      }
+    };
+
+    auto maybe_assign = [&](pair<LiveInterval *, const MCPhysReg *> &p) {
+      if (p.second) {
+        lrm->assign(*p.first, *p.second);
+      }
+    };
+
+    auto try_next_in_group = [&]() {
+      assert(!group.empty());
+      assigned->push_back(
+          std::make_pair(group.back(), alloc_next(*group.back(), except)));
+      group.pop_back();
+      maybe_assign(assigned->back());
+    };
+
+    auto back_to_previous = [&]() {
+      assert(!assigned->empty());
+      maybe_unassign(assigned->back());
+      group.push_back(assigned->back().first);
+      assigned->pop_back();
+    };
+
+    auto try_next_reg = [&]() {
+      assert(!assigned->empty());
+      maybe_unassign(assigned->back());
+      assigned->back().second =
+          alloc_next(*assigned->back().first, except, &assigned->back().second);
+      maybe_assign(assigned->back());
+    };
+
+    try_next_in_group();
+
+    while (!group.empty() || assigned->back().second == nullptr) {
+      if (assigned->back().second == nullptr) {
+        back_to_previous();
+        if (assigned->empty()) {
+          return nullptr;
+        }
+        try_next_reg();
+      } else {
+        try_next_in_group();
+      }
+    }
+    for (auto &p : *assigned) {
+      lrm->unassign(*p.first);
+    }
+    return assigned;
+  }
+
+  template <typename C, typename = is_iterable_of<C, LiveInterval *>>
+  unique_ptr<vector<MCPhysReg>>
+  evict_intervals(const C &lives, const BitVector *excepts = nullptr) const {
+    DenseMap<LiveInterval *, const MCPhysReg *> newmap;
+    vector<LiveInterval *> ungrouped(lives.begin(), lives.end());
+
+    while (!ungrouped.empty()) {
+      vector<LiveInterval *> group;
+      group.push_back(ungrouped.back());
+      ungrouped.pop_back();
+      bool done = false;
+      while (!done) {
+        auto it = move_to_end_if(ungrouped, [&](LiveInterval *h) {
+          return interferes(group, *h, *mri);
+        });
+        done = it == ungrouped.end();
+        std::copy(it, ungrouped.end(), push_to(group));
+        ungrouped.erase(it, ungrouped.end());
+      }
+      if (auto newassigns = alloc_interf_intervals(group, excepts)) {
+        for (auto pair_ : *newassigns) {
+          newmap.insert(pair_);
+        }
+      } else {
+        return nullptr;
+      }
+    }
+    auto rv = make_unique<vector<MCPhysReg>>();
+    transform(lives, push_to(*rv), [&](LiveInterval *l) { return *newmap[l]; });
+    return rv;
+  }
+
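+  // reserve_phys_reg below drives the helpers that follow as a transaction:
+  // unassign_all snapshots the allocator's current choices, evict_intervals
+  // proposes a conflict-free reshuffle, and assign_all commits whichever
+  // register vector (old or new) ends up being kept.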
+  MCPhysReg unassign(LiveInterval &live) {
+    unsigned old = get_phys(live.reg, *vrm);
+    lrm->unassign(live);
+    return old;
+  }
+
+  template <typename C, typename = is_iterable_of<C, LiveInterval *>>
+  vector<MCPhysReg> unassign_all(C &lives) {
+    vector<MCPhysReg> r;
+    transform(lives, push_to(r), [&](LiveInterval *l) { return unassign(*l); });
+    return r;
+  }
+
+  template <typename C, typename D,
+            typename = is_iterable_of<C, LiveInterval *>,
+            typename = is_iterable_of<D, MCPhysReg>>
+  void assign_all(C &lives, D &&regs) {
+    for (auto intv_reg : zip_first(lives, std::forward<D>(regs))) {
+      lrm->assign(*std::get<0>(intv_reg), std::get<1>(intv_reg));
+    }
+  }
+
+  bool reserve_phys_reg(MCPhysReg preg, LiveInterval &live) {
+    vector<LiveInterval *> evictees;
+    if (!interf(live, preg, evictees)) {
+      DEBUG(dbgs() << "ReAllocTool: " << tri->getName(preg)
+                   << " is already free.\n");
+      return true;
+    } else if (evictees.size() > 0) {
+      DEBUG(dbgs() << "ReAllocTool: trying to reserve " << tri->getName(preg)
+                   << " by evicting:\n"
+                   << evictees);
+      vector<MCPhysReg> oldregs = unassign_all(evictees);
+      BitVector bv = bv_from_regs(preg);
+      if (auto newregs = evict_intervals(evictees, &bv)) {
+        assign_all(evictees, *newregs);
+        return true;
+      }
+      assign_all(evictees, oldregs);
+    }
+    DEBUG(dbgs() << "ReAllocTool: unable to reserve " << tri->getName(preg)
+                 << "\n");
+    return false;
+  }
+};
+
+struct Candidate {
+  MachineInstr *ins;
+  MachineInstr *gr8def;
+  MachineInstr *movzx;
+  vector<MCPhysReg> constraints;
+  LiveInterval *live32;
+  LiveInterval *live8;
+  unique_ptr<LiveInterval> extra;
+  // private:
+  // assign/reassign
+  unsigned pdest;
+  unsigned psrc;
+
+  static MachineInstr *valid_candidate(MachineInstr &i, LiveIntervals &li) {
+    if (i.getOpcode() != X86::MOVZX32rr8 || i.getOperand(1).getSubReg() != 0) {
+      return nullptr;
+    }
+
+    const MachineFunction &f = *i.getParent()->getParent();
+    const MachineRegisterInfo &mri = f.getRegInfo();
+    const TargetRegisterInfo &tri = *f.getSubtarget().getRegisterInfo();
+
+    unsigned src = i.getOperand(1).getReg();
+    auto bbdefs = dominating_defs(src, mri, *li.getSlotIndexes());
+    if (bbdefs.size() > 1 || (mri.getSimpleHint(src) &&
+                              !tri.isVirtualRegister(mri.getSimpleHint(src)))) {
+      DEBUG(dbgs() << "passing over " << i << "defs: " << bbdefs.size()
+                   << ", gr8 hint: " << PrintReg(mri.getSimpleHint(src), &tri)
+                   << "\n");
+      return nullptr;
+    }
+    return bbdefs.begin()->second;
+  }
+
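+  // How a Candidate is built below: (1) insert a dummy MOV32r0 at a
+  // flag-safe point above the dominating GR8 def, (2) grow a scratch
+  // interval ("extra") covering the xor, the def, and both original live
+  // ranges, and (3) give up if "extra" overlaps the GR32 range, since the
+  // movzx could not be deleted in that case.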
+  static unique_ptr<Candidate> from_mi(MachineInstr &i, LiveIntervals &li,
+                                       const VirtRegMap &vrm) {
+    const MachineFunction &f = *i.getParent()->getParent();
+    const MachineRegisterInfo &mri = f.getRegInfo();
+    const TargetRegisterInfo &tri = *f.getSubtarget().getRegisterInfo();
+
+    MachineInstr *def, *ins;
+    if ((def = valid_candidate(i, li)) == nullptr) {
+      return nullptr;
+    }
+
+    unsigned dest = i.getOperand(0).getReg(), src = i.getOperand(1).getReg();
+    LiveInterval &live32 = li.getInterval(dest), &live8 = li.getInterval(src);
+    unique_ptr<LiveInterval> extra(new LiveInterval(live32.reg, live32.weight));
+
+    if ((ins = insert_mov32r0(*def, *extra, li)) == nullptr) {
+      return nullptr;
+    }
+
+    li.InsertMachineInstrInMaps(*ins);
+    add_seg(*ins, *def, *extra, li);
+    if (extra->overlaps(live32)) {
+      li.RemoveMachineInstrFromMaps(*ins);
+      ins->eraseFromParent();
+      return nullptr;
+    }
+
+    add_segs(live32, *extra, li);
+    add_segs(live8, *extra, li);
+
+    // look for copy instr reg alloc hints
+    vector<MCPhysReg> cx;
+    for (const MachineInstr &use : mri.use_instructions(dest)) {
+      if (use.isCopy() && !tri.isVirtualRegister(use.getOperand(0).getReg())) {
+        unsigned r =
+            use.getOperand(1).getSubReg()
+                ? tri.getMatchingSuperReg(use.getOperand(0).getReg(),
+                                          use.getOperand(1).getSubReg(),
+                                          mri.getRegClass(dest))
+                : get_phys(use.getOperand(0), vrm);
+        if (f.getSubtarget<X86Subtarget>().is64Bit() ||
+            X86::GR32_ABCDRegClass.contains(r)) {
+          cx.push_back(r);
+        }
+      }
+    }
+
+    return unique_ptr<Candidate>(new Candidate{
+        ins, def, &i, std::move(cx), &live32, &live8, std::move(extra), 0, 0});
+  }
+
+  bool operator<(const Candidate &b) const {
+    if (constraints.size() > 0 && b.constraints.size() == 0)
+      return true;
+    if (b.constraints.size() > 0 && constraints.size() == 0)
+      return false;
+    if (constraints.size() < b.constraints.size())
+      return true;
+    return li_size() > b.li_size();
+  }
+
+  unsigned li_size() const { return extra->getSize(); }
+
+  friend raw_ostream &operator<<(raw_ostream &out, const Candidate &c) {
+    out << "Candidate:\n\tinserted: " << (*c.ins)
+        << "\tgr8 def: " << (*c.gr8def) << "\tmovzx: " << (*c.movzx)
+        << "\txor gr32: " << (*c.extra);
+    if (c.constraints.size() > 0) {
+      out << "\n\tconstraints:";
+      for (unsigned cx : c.constraints) {
+        out << " " << PrintReg(cx, &c.tri());
+      }
+    } else {
+      out << "\n\tno constraints.";
+    }
+    return out;
+  }
+
+  const X86RegisterInfo &tri() const {
+    return *reinterpret_cast<const X86RegisterInfo *>(
+        ins->getParent()->getParent()->getSubtarget().getRegisterInfo());
+  }
+
+  const X86InstrInfo &tii() const {
+    return *reinterpret_cast<const X86InstrInfo *>(
+        ins->getParent()->getParent()->getSubtarget().getInstrInfo());
+  }
+
+  MachineRegisterInfo &mri() const {
+    return ins->getParent()->getParent()->getRegInfo();
+  }
+
+  void unassign(ReAllocTool &ratool) {
+    pdest = ratool.unassign(*live32);
+    psrc = ratool.unassign(*live8);
+  }
+
+  void assign_old(LiveRegMatrix &lrm) {
+    lrm.assign(*live32, pdest);
+    lrm.assign(*live8, psrc);
+    pdest = psrc = 0;
+  }
+
+  void assign_new(LiveRegMatrix &lrm, LiveIntervals &li, MCPhysReg newdest) {
+    // vsrc uses => vdest:sub_8bit; insert vdest = mov32r0; del movzx
+    unsigned vdest = movzx->getOperand(0).getReg();
+    unsigned vsrc = movzx->getOperand(1).getReg();
+
+    // in-place operand mutation would confuse defusechain_iterator
+    vector<MachineOperand *> ops;
+    transform(mri().reg_operands(vsrc), push_to(ops),
+              [](MachineOperand &op) { return &op; });
+    for (MachineOperand *op : ops) {
+      DEBUG(dbgs() << "changing " << (*op->getParent()));
+      op->substVirtReg(vdest, X86::sub_8bit, tri());
+      DEBUG(dbgs() << "to " << (*op->getParent()));
+    }
+
+    li.RemoveMachineInstrFromMaps(*movzx);
+    movzx->eraseFromParent();
+    li.removeInterval(vsrc);
+    li.removeInterval(vdest);
+
+    const TargetRegisterClass &destcls = *mri().getRegClass(vdest);
+    ins->getOperand(0).setReg(vdest);
+    if (destcls.getSize() > 32 / 8) {
+      ins->getOperand(0).setSubReg(X86::sub_32bit);
+      ins->getOperand(0).setIsUndef();
+    }
+    if (const TargetRegisterClass *newcls = gr8def->getRegClassConstraintEffect(
+            0, ins->getRegClassConstraintEffect(0, &destcls, &tii(), &tri()),
+            &tii(), &tri())) {
+      DEBUG(dbgs() << "updating reg class from "
+                   << tri().getRegClassName(&destcls) << " to "
+                   << tri().getRegClassName(newcls) << "\n");
+      mri().setRegClass(vdest, newcls);
+    } else {
+      DEBUG(dbgs() << "not updating reg class\n");
+    }
+    lrm.assign(li.createAndComputeVirtRegInterval(vdest), newdest);
+  }
+
+  bool valid_dest_reg(MCPhysReg physreg) const {
+    return mri().getRegClass(movzx->getOperand(0).getReg())->contains(physreg);
+  }
+};
+
+struct X86FixupZExt : public MachineFunctionPass {
+  static char id;
+
+  X86FixupZExt() : MachineFunctionPass(id) {}
+
+  const char *getPassName() const override {
+    return "X86 Zero-Extension Fix-up";
+  }
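+
+  // Note on placement: the pass is wired in via addPreRewrite (see the
+  // X86TargetMachine.cpp hunk below), i.e. after register allocation but
+  // before virtual registers are rewritten, which is why it consumes
+  // VirtRegMap, LiveIntervals, and LiveRegMatrix.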
+  void getAnalysisUsage(AnalysisUsage &a) const override {
+    a.addRequired<VirtRegMap>();
+    a.addRequired<LiveIntervals>();
+    a.addRequired<LiveRegMatrix>();
+    a.setPreservesAll();
+    return MachineFunctionPass::getAnalysisUsage(a);
+  }
+
+  bool runOnMachineFunction(MachineFunction &f) override {
+    VirtRegMap &vrm = getAnalysis<VirtRegMap>();
+    LiveIntervals &li = getAnalysis<LiveIntervals>();
+    LiveRegMatrix &lrm = getAnalysis<LiveRegMatrix>();
+    vector<Candidate> constrained, cands, dispose;
+    ReAllocTool ratool(f, lrm, vrm);
+
+    DEBUG(dbgs() << "analyzing " << f.getName() << "'s movzxes.\n");
+    for (MachineBasicBlock &bb : f) {
+      for (MachineInstr &i : bb) {
+        if (auto cand = Candidate::from_mi(i, li, vrm)) {
+          if (cand->constraints.size() > 0) {
+            constrained.emplace_back(std::move(*cand.release()));
+          } else {
+            cands.emplace_back(std::move(*cand.release()));
+          }
+        }
+      }
+    }
+
+    BitVector nosub8;
+    if (f.getSubtarget<X86Subtarget>().is64Bit()) {
+      nosub8 = ratool.bv_from_regs({X86::RIP});
+    } else {
+      nosub8 = ratool.bv_from_regs(ArrayRef<MCPhysReg>(
+          X86::GR32_ABCDRegClass.begin(), X86::GR32_ABCDRegClass.end()));
+      nosub8.flip();
+    }
+
+    DEBUG(vrm.print(dbgs()));
+    DEBUG(f.print(dbgs(), li.getSlotIndexes()));
+    std::sort(constrained.begin(), constrained.end());
+    std::for_each(constrained.begin(), constrained.end(), [&](Candidate &c) {
+      DEBUG(dbgs() << c << "\n");
+      c.unassign(ratool);
+      bool demote = true;
+      for (MCPhysReg preg : c.constraints) {
+        if (!nosub8.test(preg) && c.valid_dest_reg(preg) &&
+            ratool.reserve_phys_reg(preg, *c.extra)) {
+          DEBUG(dbgs() << "works\n");
+          c.assign_new(lrm, li, preg);
+          return;
+        }
+        // only demote if RA pass missed all hints
+        demote &= preg != c.pdest;
+      }
+      DEBUG(dbgs() << "could not transform\n");
+      c.assign_old(lrm);
+      if (demote) {
+        c.constraints.clear();
+        DEBUG(dbgs() << "demoting to unconstrained candidate\n");
+        cands.push_back(std::move(c));
+      } else {
+        dispose.push_back(std::move(c));
+      }
+    });
+
+    auto try_harder_to_alloc = [&](Candidate &c) {
+      for (MCPhysReg newreg : X86::GR32_ABCDRegClass) {
+        if (c.valid_dest_reg(newreg) && !ratool.unused_csr.test(newreg) &&
+            ratool.reserve_phys_reg(newreg, *c.extra)) {
+          return newreg;
+        }
+      }
+      return static_cast<MCPhysReg>(0);
+    };
+
+    std::sort(cands.begin(), cands.end());
+    for (Candidate &c : cands) {
+      DEBUG(dbgs() << c << "\n");
+      c.unassign(ratool);
+      MCPhysReg newreg;
+      if (!f.getSubtarget<X86Subtarget>().is64Bit() &&
+          ((newreg = ratool.alloc(*c.extra, &nosub8)) != 0 ||
+           (newreg = try_harder_to_alloc(c)) != 0)) {
+        DEBUG(dbgs() << "works\n");
+        c.assign_new(lrm, li, newreg);
+      } else if (f.getSubtarget<X86Subtarget>().is64Bit() &&
+                 (newreg = ratool.alloc(*c.extra, &nosub8)) != 0) {
+        DEBUG(dbgs() << "works\n");
+        c.assign_new(lrm, li, newreg);
+      } else {
+        DEBUG(dbgs() << "could not transform\n");
+        c.assign_old(lrm);
+        dispose.push_back(std::move(c));
+      }
+    }
+
+    for (Candidate &c : dispose) {
+      DEBUG(dbgs() << "purging dummy instr: " << (*c.ins));
+      li.RemoveMachineInstrFromMaps(*c.ins);
+      c.ins->eraseFromParent();
+    }
+    return false;
+  }
+};
+
+char X86FixupZExt::id = 0;
+}
+
+namespace llvm {
+FunctionPass *createX86FixupZExt() { return new X86FixupZExt(); }
+}
Index: lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- lib/Target/X86/X86TargetMachine.cpp
+++ lib/Target/X86/X86TargetMachine.cpp
@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "X86TargetMachine.h"
 #include "X86.h"
+#include "X86TargetMachine.h"
 #include "X86TargetObjectFile.h"
 #include "X86TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
@@ -29,6 +29,10 @@
     cl::desc("Enable the machine combiner pass"),
     cl::init(true), cl::Hidden);
 
+static cl::opt<bool> EnableSetCCFixup("setcc-fixup",
+                                      cl::desc("Apply X86FixupSetCC"),
+                                      cl::init(false), cl::Hidden);
+
 namespace llvm {
 void initializeWinEHStatePassPass(PassRegistry &);
 }
@@ -238,7 +242,6 @@
   });
 }
 
-
 //===----------------------------------------------------------------------===//
 // Pass Pipeline Configuration
 //===----------------------------------------------------------------------===//
@@ -260,6 +263,7 @@
   bool addPreISel() override;
   void addPreRegAlloc() override;
   void addPostRegAlloc() override;
+  bool addPreRewrite() override;
   void addPreEmitPass() override;
   void addPreSched2() override;
 };
@@ -305,8 +309,10 @@
 
 void X86PassConfig::addPreRegAlloc() {
   if (getOptLevel() != CodeGenOpt::None) {
-    addPass(createX86FixupSetCC());
-    addPass(createX86OptimizeLEAs());
+    if (EnableSetCCFixup) {
+      addPass(createX86FixupSetCC());
+    }
+    addPass(createX86OptimizeLEAs());
   }
 
   addPass(createX86CallFrameOptimization());
@@ -317,6 +323,13 @@
     addPass(createX86FloatingPointStackifierPass());
 }
 
+bool X86PassConfig::addPreRewrite() {
+  if (!EnableSetCCFixup) {
+    addPass(createX86FixupZExt());
+  }
+  return false;
+}
+
 void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); }
 
 void X86PassConfig::addPreEmitPass() {
Index: test/CodeGen/X86/avx-intrinsics-x86.ll
===================================================================
--- test/CodeGen/X86/avx-intrinsics-x86.ll
+++ test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -149,20 +149,20 @@
 define i32 @test_x86_sse2_comieq_sd(<2 x double> %a0, <2 x double> %a1) {
 ; AVX-LABEL: test_x86_sse2_comieq_sd:
 ; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
 ; AVX-NEXT: vcomisd %xmm1, %xmm0
-; AVX-NEXT: setnp %al
-; AVX-NEXT: sete %cl
-; AVX-NEXT: andb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: setnp %cl
+; AVX-NEXT: sete %al
+; AVX-NEXT: andb %cl, %al
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse2_comieq_sd:
 ; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
 ; AVX512VL-NEXT: vcomisd %xmm1, %xmm0
-; AVX512VL-NEXT: setnp %al
-; AVX512VL-NEXT: sete %cl
-; AVX512VL-NEXT: andb %al, %cl
-; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: setnp %cl
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: andb %cl, %al
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -253,20 +253,20 @@
 define i32 @test_x86_sse2_comineq_sd(<2 x double> %a0, <2 x double> %a1) {
 ; AVX-LABEL: test_x86_sse2_comineq_sd:
 ; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
 ; AVX-NEXT: vcomisd %xmm1, %xmm0
-; AVX-NEXT: setp %al
-; AVX-NEXT: setne %cl
-; AVX-NEXT: orb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: setp %cl
+; AVX-NEXT: setne %al
+; AVX-NEXT: orb %cl, %al
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse2_comineq_sd:
 ; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
 ; AVX512VL-NEXT: vcomisd %xmm1, %xmm0
-; AVX512VL-NEXT: setp %al
-; AVX512VL-NEXT: setne %cl
-; AVX512VL-NEXT: orb %al, %cl
-; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: setp %cl
+; AVX512VL-NEXT: setne %al
+; AVX512VL-NEXT: orb %cl, %al
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -1240,20 +1240,20 @@
 define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) {
 ; AVX-LABEL: test_x86_sse2_ucomieq_sd:
 ; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
 ; AVX-NEXT: vucomisd %xmm1, %xmm0
-; AVX-NEXT: setnp %al
-; AVX-NEXT: sete %cl
-; AVX-NEXT: andb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: setnp %cl
+; AVX-NEXT: sete %al
+; AVX-NEXT: andb %cl, %al
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse2_ucomieq_sd:
 ; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
 ; AVX512VL-NEXT: vucomisd %xmm1, %xmm0
-; AVX512VL-NEXT: setnp %al
-; AVX512VL-NEXT: sete %cl
-; AVX512VL-NEXT: andb %al, %cl
-; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: setnp %cl
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: andb %cl, %al
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -1344,20 +1344,20 @@
 define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) {
 ; AVX-LABEL: test_x86_sse2_ucomineq_sd:
 ; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
 ; AVX-NEXT: vucomisd %xmm1, %xmm0
-; AVX-NEXT: setp %al
-; AVX-NEXT: setne %cl
-; AVX-NEXT: orb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: setp %cl
+; AVX-NEXT: setne %al
+; AVX-NEXT: orb %cl, %al
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse2_ucomineq_sd:
 ; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
 ; AVX512VL-NEXT: vucomisd %xmm1, %xmm0
-; AVX512VL-NEXT: setp %al
-; AVX512VL-NEXT: setne %cl
-; AVX512VL-NEXT: orb %al, %cl
-; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: setp %cl
+; AVX512VL-NEXT: setne %al
+; AVX512VL-NEXT: orb %cl, %al
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -1943,29 +1943,23 @@
 }
 
-define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) {
 ; AVX-LABEL: test_x86_sse42_pcmpestria128:
 ; AVX: ## BB#0:
-; AVX-NEXT: pushl %ebx
 ; AVX-NEXT: movl $7, %eax
 ; AVX-NEXT: movl $7, %edx
-; AVX-NEXT: xorl %ebx, %ebx
 ; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; AVX-NEXT: seta %bl
-; AVX-NEXT: movl %ebx, %eax
-; AVX-NEXT: popl %ebx
+; AVX-NEXT: seta %al
+; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse42_pcmpestria128:
 ; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: pushl %ebx
 ; AVX512VL-NEXT: movl $7, %eax
 ; AVX512VL-NEXT: movl $7, %edx
-; AVX512VL-NEXT: xorl %ebx, %ebx
 ; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; AVX512VL-NEXT: seta %bl
-; AVX512VL-NEXT: movl %ebx, %eax
-; AVX512VL-NEXT: popl %ebx
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: movzbl %al, %eax
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse42.pcmpestria128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
   ret i32 %res
@@ -1997,29 +1991,23 @@
 declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
 
-define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) {
 ; AVX-LABEL: test_x86_sse42_pcmpestrio128:
 ; AVX: ## BB#0:
-; AVX-NEXT: pushl %ebx
 ; AVX-NEXT: movl $7, %eax
 ; AVX-NEXT: movl $7, %edx
-; AVX-NEXT: xorl %ebx, %ebx
 ; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; AVX-NEXT: seto %bl
-; AVX-NEXT: movl %ebx, %eax
-; AVX-NEXT: popl %ebx
+; AVX-NEXT: seto %al
+; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse42_pcmpestrio128:
 ; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: pushl %ebx
 ; AVX512VL-NEXT: movl $7, %eax
 ; AVX512VL-NEXT: movl $7, %edx
-; AVX512VL-NEXT: xorl %ebx, %ebx
 ; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; AVX512VL-NEXT: seto %bl
-; AVX512VL-NEXT: movl %ebx, %eax
-; AVX512VL-NEXT: popl %ebx
+; AVX512VL-NEXT: seto %al
+; AVX512VL-NEXT: movzbl %al, %eax
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
   ret i32 %res
@@ -2027,29 +2015,23 @@
 declare i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
 
-define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) {
 ; AVX-LABEL: test_x86_sse42_pcmpestris128:
 ; AVX: ## BB#0:
-; AVX-NEXT: pushl %ebx
 ; AVX-NEXT: movl $7, %eax
 ; AVX-NEXT: movl $7, %edx
-; AVX-NEXT: xorl %ebx, %ebx
 ; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; AVX-NEXT: sets %bl
-; AVX-NEXT: movl %ebx, %eax
-; AVX-NEXT: popl %ebx
+; AVX-NEXT: sets %al
+; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse42_pcmpestris128:
 ; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: pushl %ebx
 ; AVX512VL-NEXT: movl $7, %eax
 ; AVX512VL-NEXT: movl $7, %edx
-; AVX512VL-NEXT: xorl %ebx, %ebx
 ; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; AVX512VL-NEXT: sets %bl
-; AVX512VL-NEXT: movl %ebx, %eax
-; AVX512VL-NEXT: popl %ebx
+; AVX512VL-NEXT: sets %al
+; AVX512VL-NEXT: movzbl %al, %eax
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse42.pcmpestris128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
   ret i32 %res
@@ -2057,29 +2039,23 @@
 declare i32 @llvm.x86.sse42.pcmpestris128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
 
-define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) {
 ; AVX-LABEL: test_x86_sse42_pcmpestriz128:
 ; AVX: ## BB#0:
-; AVX-NEXT: pushl %ebx
 ; AVX-NEXT: movl $7, %eax
 ; AVX-NEXT: movl $7, %edx
-; AVX-NEXT: xorl %ebx, %ebx
 ; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; AVX-NEXT: sete %bl
-; AVX-NEXT: movl %ebx, %eax
-; AVX-NEXT: popl %ebx
+; AVX-NEXT: sete %al
+; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse42_pcmpestriz128:
 ; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: pushl %ebx
 ; AVX512VL-NEXT: movl $7, %eax
 ; AVX512VL-NEXT: movl $7, %edx
-; AVX512VL-NEXT: xorl %ebx, %ebx
 ; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; AVX512VL-NEXT: sete %bl
-; AVX512VL-NEXT: movl %ebx, %eax
-; AVX512VL-NEXT: popl %ebx
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: movzbl %al, %eax
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
   ret i32 %res
@@ -2357,20 +2333,20 @@
 define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) {
 ; AVX-LABEL: test_x86_sse_comieq_ss:
 ; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
 ; AVX-NEXT: vcomiss %xmm1, %xmm0
-; AVX-NEXT: setnp %al
-; AVX-NEXT: sete %cl
-; AVX-NEXT: andb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: setnp %cl
+; AVX-NEXT: sete %al
+; AVX-NEXT: andb %cl, %al
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse_comieq_ss:
 ; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
 ; AVX512VL-NEXT: vcomiss %xmm1, %xmm0
-; AVX512VL-NEXT: setnp %al
-; AVX512VL-NEXT: sete %cl
-; AVX512VL-NEXT: andb %al, %cl
-; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: setnp %cl
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: andb %cl, %al
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -2461,20 +2437,20 @@
 define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) {
 ; AVX-LABEL: test_x86_sse_comineq_ss:
 ; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
 ; AVX-NEXT: vcomiss %xmm1, %xmm0
-; AVX-NEXT: setp %al
-; AVX-NEXT: setne %cl
-; AVX-NEXT: orb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: setp %cl
+; AVX-NEXT: setne %al
+; AVX-NEXT: orb %cl, %al
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse_comineq_ss:
 ; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
 ; AVX512VL-NEXT: vcomiss %xmm1, %xmm0
-; AVX512VL-NEXT: setp %al
-; AVX512VL-NEXT: setne %cl
-; AVX512VL-NEXT: orb %al, %cl
-; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: setp %cl
+; AVX512VL-NEXT: setne %al
+; AVX512VL-NEXT: orb %cl, %al
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -2797,20 +2773,20 @@
 define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) {
 ; AVX-LABEL: test_x86_sse_ucomieq_ss:
 ; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
 ; AVX-NEXT: vucomiss %xmm1, %xmm0
-; AVX-NEXT: setnp %al
-; AVX-NEXT: sete %cl
-; AVX-NEXT: andb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: setnp %cl
+; AVX-NEXT: sete %al
+; AVX-NEXT: andb %cl, %al
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse_ucomieq_ss:
 ; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
 ; AVX512VL-NEXT: vucomiss %xmm1, %xmm0
-; AVX512VL-NEXT: setnp %al
-; AVX512VL-NEXT: sete %cl
-; AVX512VL-NEXT: andb %al, %cl
-; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: setnp %cl
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: andb %cl, %al
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -2901,20 +2877,20 @@
 define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) {
 ; AVX-LABEL: test_x86_sse_ucomineq_ss:
 ; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
 ; AVX-NEXT: vucomiss %xmm1, %xmm0
-; AVX-NEXT: setp %al
-; AVX-NEXT: setne %cl
-; AVX-NEXT: orb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: setp %cl
+; AVX-NEXT: setne %al
+; AVX-NEXT: orb %cl, %al
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse_ucomineq_ss:
 ; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
 ; AVX512VL-NEXT: vucomiss %xmm1, %xmm0
-; AVX512VL-NEXT: setp %al
-; AVX512VL-NEXT: setne %cl
-; AVX512VL-NEXT: orb %al, %cl
-; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: setp %cl
+; AVX512VL-NEXT: setne %al
+; AVX512VL-NEXT: orb %cl, %al
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res
Index: test/CodeGen/X86/avx512-cmp.ll
===================================================================
--- test/CodeGen/X86/avx512-cmp.ll
+++ test/CodeGen/X86/avx512-cmp.ll
@@ -51,11 +51,11 @@
 define i32 @test3(float %a, float %b) {
 ; ALL-LABEL: test3:
 ; ALL: ## BB#0:
+; ALL-NEXT: xorl %eax, %eax
 ; ALL-NEXT: vucomiss %xmm1, %xmm0
-; ALL-NEXT: setnp %al
-; ALL-NEXT: sete %cl
-; ALL-NEXT: andb %al, %cl
-; ALL-NEXT: movzbl %cl, %eax
+; ALL-NEXT: setnp %cl
+; ALL-NEXT: sete %al
+; ALL-NEXT: andb %cl, %al
 ; ALL-NEXT: retq
 
   %cmp10.i = fcmp oeq float %a, %b
@@ -67,12 +67,12 @@
 ; ALL-LABEL: test5:
 ; ALL: ## BB#0: ## %entry
 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT: xorl %eax, %eax
 ; ALL-NEXT: vucomiss %xmm1, %xmm0
 ; ALL-NEXT: jne LBB3_1
 ; ALL-NEXT: jnp LBB3_2
 ; ALL-NEXT: LBB3_1: ## %if.end
 ; ALL-NEXT: seta %al
-; ALL-NEXT: movzbl %al, %eax
 ; ALL-NEXT: leaq {{.*}}(%rip), %rcx
 ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; ALL-NEXT: LBB3_2: ## %return
Index: test/CodeGen/X86/cmpxchg-i1.ll
===================================================================
--- test/CodeGen/X86/cmpxchg-i1.ll
+++ test/CodeGen/X86/cmpxchg-i1.ll
@@ -34,7 +34,7 @@
 ; CHECK-LABEL: cmpxchg_sext:
 ; CHECK-DAG: cmpxchgl
 ; CHECK-NOT: cmpl
-; CHECK: sete %cl
+; CHECK: sete %al
 ; CHECK: retq
   %pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
   %success = extractvalue { i32, i1 } %pair, 1
@@ -44,10 +44,10 @@
 
 define i32 @cmpxchg_zext(i32* %addr, i32 %desired, i32 %new) {
 ; CHECK-LABEL: cmpxchg_zext:
-; CHECK: xorl %e[[R:[a-z]]]x
 ; CHECK: cmpxchgl
 ; CHECK-NOT: cmp
-; CHECK: sete %[[R]]l
+; CHECK: sete [[BYTE:%[a-z0-9]+]]
+; CHECK: movzbl [[BYTE]], %eax
   %pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
   %success = extractvalue { i32, i1 } %pair, 1
   %mask = zext i1 %success to i32
Index: test/CodeGen/X86/cmpxchg-i128-i1.ll
===================================================================
--- test/CodeGen/X86/cmpxchg-i128-i1.ll
+++ test/CodeGen/X86/cmpxchg-i128-i1.ll
@@ -44,10 +44,10 @@
 
 define i128 @cmpxchg_zext(i128* %addr, i128 %desired, i128 %new) {
 ; CHECK-LABEL: cmpxchg_zext:
-; CHECK: xorl
 ; CHECK: cmpxchg16b
 ; CHECK-NOT: cmpq
-; CHECK: sete
+; CHECK: sete [[BYTE:%[a-z0-9]+]]
+; CHECK: movzbl [[BYTE]], %eax
   %pair = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst seq_cst
   %success = extractvalue { i128, i1 } %pair, 1
   %mask = zext i1 %success to i128
Index: test/CodeGen/X86/fast-isel-cmp.ll
===================================================================
--- test/CodeGen/X86/fast-isel-cmp.ll
+++ test/CodeGen/X86/fast-isel-cmp.ll
@@ -8,9 +8,9 @@
 ; SDAG-NEXT: andl $1, %eax
 ; FAST-LABEL: fcmp_oeq
 ; FAST: ucomiss %xmm1, %xmm0
-; FAST-NEXT: sete %al
-; FAST-NEXT: setnp %cl
-; FAST-NEXT: andb %al, %cl
+; FAST-NEXT: sete %cl
+; FAST-NEXT: setnp %al
+; FAST-NEXT: andb %cl, %al
   %1 = fcmp oeq float %x, %y
   ret i1 %1
 }
@@ -153,9 +153,9 @@
 ; SDAG-NEXT: andl $1, %eax
 ; FAST-LABEL: fcmp_une
 ; FAST: ucomiss %xmm1, %xmm0
-; FAST-NEXT: setne %al
-; FAST-NEXT: setp %cl
-; FAST-NEXT: orb %al, %cl
+; FAST-NEXT: setne %cl
+; FAST-NEXT: setp %al
+; FAST-NEXT: orb %cl, %al
   %1 = fcmp une float %x, %y
   ret i1 %1
 }
@@ -290,10 +290,11 @@
 ; SDAG-NEXT: andl $1, %eax
 ; FAST-LABEL: fcmp_oeq3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: sete %al
-; FAST-NEXT: setnp %cl
-; FAST-NEXT: andb %al, %cl
+; FAST-NEXT: sete %cl
+; FAST-NEXT: setnp %al
+; FAST-NEXT: andb %cl, %al
   %1 = fcmp oeq float %x, 0.000000e+00
   ret i1 %1
 }
@@ -314,6 +315,7 @@
 ; SDAG-NEXT: seta %al
 ; FAST-LABEL: fcmp_ogt3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm1, %xmm0
 ; FAST-NEXT: seta %al
   %1 = fcmp ogt float %x, 0.000000e+00
@@ -338,6 +340,7 @@
 ; SDAG-NEXT: setae %al
 ; FAST-LABEL: fcmp_oge3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm1, %xmm0
 ; FAST-NEXT: setae %al
   %1 = fcmp oge float %x, 0.000000e+00
@@ -360,6 +363,7 @@
 ; SDAG-NEXT: seta %al
 ; FAST-LABEL: fcmp_olt3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm0, %xmm1
 ; FAST-NEXT: seta %al
   %1 = fcmp olt float %x, 0.000000e+00
@@ -384,6 +388,7 @@
 ; SDAG-NEXT: setae %al
 ; FAST-LABEL: fcmp_ole3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm0, %xmm1
 ; FAST-NEXT: setae %al
   %1 = fcmp ole float %x, 0.000000e+00
@@ -406,6 +411,7 @@
 ; SDAG-NEXT: setne %al
 ; FAST-LABEL: fcmp_one3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm1, %xmm0
 ; FAST-NEXT: setne %al
   %1 = fcmp one float %x, 0.000000e+00
@@ -472,6 +478,7 @@
 ; SDAG-NEXT: sete %al
 ; FAST-LABEL: fcmp_ueq3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm1, %xmm0
 ; FAST-NEXT: sete %al
   %1 = fcmp ueq float %x, 0.000000e+00
@@ -496,6 +503,7 @@
 ; SDAG-NEXT: setb %al
 ; FAST-LABEL: fcmp_ugt3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm0, %xmm1
 ; FAST-NEXT: setb %al
   %1 = fcmp ugt float %x, 0.000000e+00
@@ -518,6 +526,7 @@
 ; SDAG-NEXT: setbe %al
 ; FAST-LABEL: fcmp_uge3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm0, %xmm1
 ; FAST-NEXT: setbe %al
   %1 = fcmp uge float %x, 0.000000e+00
@@ -542,6 +551,7 @@
 ; SDAG-NEXT: setb %al
 ; FAST-LABEL: fcmp_ult3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm1, %xmm0
 ; FAST-NEXT: setb %al
   %1 = fcmp ult float %x, 0.000000e+00
@@ -564,6 +574,7 @@
 ; SDAG-NEXT: setbe %al
 ; FAST-LABEL: fcmp_ule3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm1, %xmm0
 ; FAST-NEXT: setbe %al
   %1 = fcmp ule float %x, 0.000000e+00
@@ -589,10 +600,11 @@
 ; SDAG-NEXT: andl $1, %eax
 ; FAST-LABEL: fcmp_une3
 ; FAST: xorps %xmm1, %xmm1
+; FAST: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: setne %al
-; FAST-NEXT: setp %cl
-; FAST-NEXT: orb %al, %cl
+; FAST-NEXT: setne %cl
+; FAST-NEXT: setp %al
+; FAST-NEXT: orb %cl, %al
   %1 = fcmp une float %x, 0.000000e+00
   ret i1 %1
 }
Index: test/CodeGen/X86/fp128-cast.ll
===================================================================
--- test/CodeGen/X86/fp128-cast.ll
+++ test/CodeGen/X86/fp128-cast.ll
@@ -238,8 +238,7 @@
 ; X64-LABEL: TestConst128:
 ; X64: movaps {{.*}}, %xmm1
 ; X64-NEXT: callq __gttf2
-; X64-NEXT: xorl
-; X64-NEXT: test
+; X64: test
 ; X64: retq
 }
Index: test/CodeGen/X86/fp128-compare.ll
===================================================================
--- test/CodeGen/X86/fp128-compare.ll
+++ test/CodeGen/X86/fp128-compare.ll
@@ -1,101 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s
 ; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s
 
 define i32 @TestComp128GT(fp128 %d1, fp128 %d2) {
-entry:
-  %cmp = fcmp ogt fp128 %d1, %d2
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
 ; CHECK-LABEL: TestComp128GT:
-; CHECK: callq __gttf2
-; CHECK: xorl %ecx, %ecx
-; CHECK: setg %cl
-; CHECK: movl %ecx, %eax
-; CHECK: retq
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq __gttf2
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setg %al
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+entry:
+  %cmp = fcmp ogt fp128 %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
 }
 
 define i32 @TestComp128GE(fp128 %d1, fp128 %d2) {
-entry:
-  %cmp = fcmp oge fp128 %d1, %d2
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
 ; CHECK-LABEL: TestComp128GE:
-; CHECK: callq __getf2
-; CHECK: xorl %ecx, %ecx
-; CHECK: testl %eax, %eax
-; CHECK: setns %cl
-; CHECK: movl %ecx, %eax
-; CHECK: retq
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq __getf2
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setns %al
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+entry:
+  %cmp = fcmp oge fp128 %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
 }
 
 define i32 @TestComp128LT(fp128 %d1, fp128 %d2) {
-entry:
-  %cmp = fcmp olt fp128 %d1, %d2
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
 ; CHECK-LABEL: TestComp128LT:
-; CHECK: callq __lttf2
-; CHECK-NEXT: shrl $31, %eax
-; CHECK: retq
-;
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp2:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq __lttf2
+; CHECK-NEXT: shrl $31, %eax
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+entry:
+  %cmp = fcmp olt fp128 %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
 ; The 'shrl' is a special optimization in llvm to combine
 ; the effect of 'fcmp olt' and 'zext'. The main purpose is
 ; to test soften call to __lttf2.
 }
 
 define i32 @TestComp128LE(fp128 %d1, fp128 %d2) {
-entry:
-  %cmp = fcmp ole fp128 %d1, %d2
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
 ; CHECK-LABEL: TestComp128LE:
-; CHECK: callq __letf2
-; CHECK: xorl %ecx, %ecx
-; CHECK: testl %eax, %eax
-; CHECK: setle %cl
-; CHECK: movl %ecx, %eax
-; CHECK: retq
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp3:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq __letf2
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setle %al
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+entry:
+  %cmp = fcmp ole fp128 %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
 }
 
 define i32 @TestComp128EQ(fp128 %d1, fp128 %d2) {
-entry:
-  %cmp = fcmp oeq fp128 %d1, %d2
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
 ; CHECK-LABEL: TestComp128EQ:
-; CHECK: callq __eqtf2
-; CHECK: xorl %ecx, %ecx
-; CHECK: testl %eax, %eax
-; CHECK: sete %cl
-; CHECK: movl %ecx, %eax
-; CHECK: retq
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp4:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq __eqtf2
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+entry:
+  %cmp = fcmp oeq fp128 %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
 }
 
 define i32 @TestComp128NE(fp128 %d1, fp128 %d2) {
-entry:
-  %cmp = fcmp une fp128 %d1, %d2
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
 ; CHECK-LABEL: TestComp128NE:
-; CHECK: callq __netf2
-; CHECK: xorl %ecx, %ecx
-; CHECK: testl %eax, %eax
-; CHECK: setne %cl
-; CHECK: movl %ecx, %eax
-; CHECK: retq
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp5:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq __netf2
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+entry:
+  %cmp = fcmp une fp128 %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
 }
 
 define fp128 @TestMax(fp128 %x, fp128 %y) {
+; CHECK-LABEL: TestMax:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: .Ltmp6:
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; CHECK-NEXT: callq __gttf2
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: jg .LBB6_2
+; CHECK-NEXT: # BB#1: # %entry
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: .LBB6_2: # %entry
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
 entry:
   %cmp = fcmp ogt fp128 %x, %y
   %cond = select i1 %cmp, fp128 %x, fp128 %y
   ret fp128 %cond
-; CHECK-LABEL: TestMax:
-; CHECK: movaps %xmm0
-; CHECK: movaps %xmm1
-; CHECK: callq __gttf2
-; CHECK: movaps {{.*}}, %xmm0
-; CHECK: testl %eax, %eax
-; CHECK: movaps {{.*}}, %xmm0
-; CHECK: retq
 }
Index: test/CodeGen/X86/sse-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -570,20 +570,20 @@
 define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
 ; X32-LABEL: test_mm_comieq_ss:
 ; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
 ; X32-NEXT: comiss %xmm1, %xmm0
-; X32-NEXT: setnp %al
-; X32-NEXT: sete %cl
-; X32-NEXT: andb %al, %cl
-; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: setnp %cl
+; X32-NEXT: sete %al
+; X32-NEXT: andb %cl, %al
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_comieq_ss:
 ; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: comiss %xmm1, %xmm0
-; X64-NEXT: setnp %al
-; X64-NEXT: sete %cl
-; X64-NEXT: andb %al, %cl
-; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: setnp %cl
+; X64-NEXT: sete %al
+; X64-NEXT: andb %cl, %al
 ; X64-NEXT: retq
   %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
   ret i32 %res
@@ -669,20 +669,20 @@
 define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
 ; X32-LABEL: test_mm_comineq_ss:
 ; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
 ; X32-NEXT: comiss %xmm1, %xmm0
-; X32-NEXT: setp %al
-; X32-NEXT: setne %cl
-; X32-NEXT: orb %al, %cl
-; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: setp %cl
+; X32-NEXT: setne %al
+; X32-NEXT: orb %cl, %al
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_comineq_ss:
 ; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: comiss %xmm1, %xmm0
-; X64-NEXT: setp %al
-; X64-NEXT: setne %cl
-; X64-NEXT: orb %al, %cl
-; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: setp %cl
+; X64-NEXT: setne %al
+; X64-NEXT: orb %cl, %al
 ; X64-NEXT: retq
   %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1)
   ret i32 %res
@@ -2071,20 +2071,20 @@
 define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
 ; X32-LABEL: test_mm_ucomieq_ss:
 ; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
 ; X32-NEXT: ucomiss %xmm1, %xmm0
-; X32-NEXT: setnp %al
-; X32-NEXT: sete %cl
-; X32-NEXT: andb %al, %cl
-; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: setnp %cl
+; X32-NEXT: sete %al
+; X32-NEXT: andb %cl, %al
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_ucomieq_ss:
 ; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: ucomiss %xmm1, %xmm0
-; X64-NEXT: setnp %al
-; X64-NEXT: sete %cl
-; X64-NEXT: andb %al, %cl
-; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: setnp %cl
+; X64-NEXT: sete %al
+; X64-NEXT: andb %cl, %al
 ; X64-NEXT: retq
   %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
   ret i32 %res
@@ -2170,20 +2170,20 @@
 define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
 ; X32-LABEL: test_mm_ucomineq_ss:
 ; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
 ; X32-NEXT: ucomiss %xmm1, %xmm0
-; X32-NEXT: setp %al
-; X32-NEXT: setne %cl
-; X32-NEXT: orb %al, %cl
-; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: setp %cl
+; X32-NEXT: setne %al
+; X32-NEXT: orb %cl, %al
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_ucomineq_ss:
 ; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: ucomiss %xmm1, %xmm0
-; X64-NEXT: setp %al
-; X64-NEXT: setne %cl
-; X64-NEXT: orb %al, %cl
-; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: setp %cl
+; X64-NEXT: setne %al
+; X64-NEXT: orb %cl, %al
 ; X64-NEXT: retq
   %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1)
   ret i32 %res
Index: test/CodeGen/X86/sse-intrinsics-x86.ll
===================================================================
--- test/CodeGen/X86/sse-intrinsics-x86.ll
+++ test/CodeGen/X86/sse-intrinsics-x86.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse | FileCheck %s --check-prefix=SSE
 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL
@@ -53,20 +54,20 @@
 define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) {
 ; SSE-LABEL: test_x86_sse_comieq_ss:
 ; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
 ; SSE-NEXT: comiss %xmm1, %xmm0
-; SSE-NEXT: setnp %al
-; SSE-NEXT: sete %cl
-; SSE-NEXT: andb %al, %cl
-; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: setnp %cl
+; SSE-NEXT: sete %al
+; SSE-NEXT: andb %cl, %al
 ; SSE-NEXT: retl
 ;
 ; KNL-LABEL: test_x86_sse_comieq_ss:
 ; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
 ; KNL-NEXT: vcomiss %xmm1, %xmm0
-; KNL-NEXT: setnp %al
-; KNL-NEXT: sete %cl
-; KNL-NEXT: andb %al, %cl
-; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: setnp %cl
+; KNL-NEXT: sete %al
+; KNL-NEXT: andb %cl, %al
 ; KNL-NEXT: retl
   %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -157,20 +158,20 @@
 define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) {
 ; SSE-LABEL: test_x86_sse_comineq_ss:
 ; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
 ; SSE-NEXT: comiss %xmm1, %xmm0
-; SSE-NEXT: setp %al
-; SSE-NEXT: setne %cl
-; SSE-NEXT: orb %al, %cl
-; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: setp %cl
+; SSE-NEXT: setne %al
+; SSE-NEXT: orb %cl, %al
 ; SSE-NEXT: retl
 ;
 ; KNL-LABEL: test_x86_sse_comineq_ss:
 ; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
 ; KNL-NEXT: vcomiss %xmm1, %xmm0
-; KNL-NEXT: setp %al
-; KNL-NEXT: setne %cl
-; KNL-NEXT: orb %al, %cl
-; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: setp %cl
+; KNL-NEXT: setne %al
+; KNL-NEXT: orb %cl, %al
 ; KNL-NEXT: retl
   %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -493,20 +494,20 @@
 define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) {
 ; SSE-LABEL: test_x86_sse_ucomieq_ss:
 ; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
 ; SSE-NEXT: ucomiss %xmm1, %xmm0
-; SSE-NEXT: setnp %al
-; SSE-NEXT: sete %cl
-; SSE-NEXT: andb %al, %cl
-; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: setnp %cl
+; SSE-NEXT: sete %al
+; SSE-NEXT: andb %cl, %al
 ; SSE-NEXT: retl
 ;
 ; KNL-LABEL: test_x86_sse_ucomieq_ss:
 ; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
 ; KNL-NEXT: vucomiss %xmm1, %xmm0
-; KNL-NEXT: setnp %al
-; KNL-NEXT: sete %cl
-; KNL-NEXT: andb %al, %cl
-; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: setnp %cl
+; KNL-NEXT: sete %al
+; KNL-NEXT: andb %cl, %al
 ; KNL-NEXT: retl
   %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -597,20 +598,20 @@
 define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) {
 ; SSE-LABEL: test_x86_sse_ucomineq_ss:
 ; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
 ; SSE-NEXT: ucomiss %xmm1, %xmm0
-; SSE-NEXT: setp %al
-; SSE-NEXT: setne %cl
-; SSE-NEXT: orb %al, %cl
-; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: setp %cl
+; SSE-NEXT: setne %al
+; SSE-NEXT: orb %cl, %al
 ; SSE-NEXT: retl
 ;
 ; KNL-LABEL: test_x86_sse_ucomineq_ss:
 ; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
 ; KNL-NEXT: vucomiss %xmm1, %xmm0
-; KNL-NEXT: setp %al
-; KNL-NEXT: setne %cl
-; KNL-NEXT: orb %al, %cl
-; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: setp %cl
+; KNL-NEXT: setne %al
+; KNL-NEXT: orb %cl, %al
 ; KNL-NEXT: retl
   %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res