Index: lib/Target/X86/CMakeLists.txt
===================================================================
--- lib/Target/X86/CMakeLists.txt
+++ lib/Target/X86/CMakeLists.txt
@@ -20,6 +20,7 @@
   X86FixupBWInsts.cpp
   X86FixupLEAs.cpp
   X86FixupSetCC.cpp
+  X86FixupZExt.cpp
   X86FloatingPoint.cpp
   X86FrameLowering.cpp
   X86ISelDAGToDAG.cpp
Index: lib/Target/X86/X86.h
===================================================================
--- lib/Target/X86/X86.h
+++ lib/Target/X86/X86.h
@@ -62,6 +62,8 @@
 /// Return a pass that transforms setcc + movzx pairs into xor + setcc.
 FunctionPass *createX86FixupSetCC();
 
+FunctionPass *createX86FixupZExt();
+
 /// Return a pass that expands WinAlloca pseudo-instructions.
 FunctionPass *createX86WinAllocaExpander();
 
Index: lib/Target/X86/X86FixupZExt.cpp
===================================================================
--- /dev/null
+++ lib/Target/X86/X86FixupZExt.cpp
@@ -0,0 +1,685 @@
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+
+#include <algorithm>
+
+#define DEBUG_TYPE "x86-fixup-zext"
+
+namespace {
+using namespace llvm;
+using std::unique_ptr;
+using std::vector;
+using std::pair;
+using Segment = LiveRange::Segment;
+
+template <typename T, typename Elem>
+using is_iterable_of = typename std::enable_if<
+    std::is_same<typename std::decay<decltype(
+                     *std::declval<T>().begin())>::type,
+                 Elem>::value>::type;
+
+template <typename T> auto push_to(T &t) -> decltype(std::back_inserter(t)) {
+  return std::back_inserter(t);
+}
+
+unsigned get_phys(unsigned reg, const VirtRegMap &vrm) {
+  return TargetRegisterInfo::isVirtualRegister(reg) ? vrm.getPhys(reg) : reg;
+}
+
+unsigned get_phys(const MachineOperand &regop, const VirtRegMap &vrm) {
+  const auto *f = regop.getParent()->getParent()->getParent();
+  const auto &tri = *f->getSubtarget().getRegisterInfo();
+  assert(regop.isReg());
+  unsigned preg = get_phys(regop.getReg(), vrm);
+  return regop.getSubReg() ? tri.getSubReg(preg, regop.getSubReg()) : preg;
+}
+
+unsigned get_phys(const MachineInstr &i, unsigned opnum,
+                  const VirtRegMap &vrm) {
+  return get_phys(i.getOperand(opnum), vrm);
+}
+
+DenseMap<MachineBasicBlock *, MachineInstr *>
+dominating_defs(unsigned gr8, const MachineRegisterInfo &mri,
+                const SlotIndexes &si) {
+  DenseMap<MachineBasicBlock *, MachineInstr *> defs;
+  // at least until release_37, getInstructionIndex is expensive.
+  DenseMap<MachineBasicBlock *, SlotIndex> cached;
+
+  for (MachineInstr &def : mri.def_instructions(gr8)) {
+    unsigned tied_use;
+    if (def.isRegTiedToUseOperand(0, &tied_use) &&
+        def.getOperand(tied_use).getReg() != def.getOperand(0).getReg()) {
+      DEBUG(dbgs() << "dominating_defs: " << def.getOperand(0)
+                   << " is tied to " << def.getOperand(tied_use) << "\n");
+      return dominating_defs(def.getOperand(tied_use).getReg(), mri, si);
+    }
+    MachineBasicBlock *bb = def.getParent();
+    if (defs.find(bb) == defs.end() ||
+        si.getInstructionIndex(def) < cached.lookup(bb)) {
+      cached[bb] = si.getInstructionIndex(def);
+      defs[bb] = &def;
+    }
+  }
+  return defs;
+}
+
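+// A sketch of the rewrite this pass aims for, on a typical setcc + zext
+// sequence (register choices illustrative only, taken from the test updates
+// below):
+//
+//   before:                        after:
+//     ucomiss %xmm1, %xmm0           xorl    %eax, %eax    <- MOV32r0
+//     sete    %cl                    ucomiss %xmm1, %xmm0
+//     movzbl  %cl, %eax              sete    %al
+//
+// The MOV32r0 has to sit where EFLAGS is dead, hence the EFLAGS live-range
+// walk in insert_mov32r0 below (hoisting above the flag def, or into a lone
+// predecessor when flags are live into the block).
+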
+void add_seg(SlotIndex s, SlotIndex e, LiveInterval &live, LiveIntervals &li) {
+  VNInfo *valno = !live.hasAtLeastOneValue()
+                      ? live.getNextValue(s, li.getVNInfoAllocator())
+                      : *live.vni_begin();
+  assert(live.getNumValNums() == 1);
+  live.addSegment(Segment(std::move(s), std::move(e), valno));
+}
+
+void add_seg(MachineInstr &s, MachineInstr &e, LiveInterval &live,
+             LiveIntervals &li) {
+  return add_seg(li.getInstructionIndex(s), li.getInstructionIndex(e), live,
+                 li);
+}
+
+void add_segs(LiveInterval &src, LiveInterval &dest, LiveIntervals &li) {
+  for (const Segment &s : src) {
+    add_seg(s.start, s.end, dest, li);
+  }
+}
+
+MachineInstr *insert_mov32r0(MachineInstr &def8, LiveInterval &live,
+                             LiveIntervals &li) {
+  auto slot = [&](MachineInstr &i) { return li.getInstructionIndex(i); };
+  const MachineFunction &f = *def8.getParent()->getParent();
+  const auto &tri = f.getSubtarget().getRegisterInfo();
+  MachineBasicBlock &bb = *def8.getParent();
+  MachineBasicBlock::iterator ins = &def8;
+
+  if (const Segment *eflagseg =
+          li.getRegUnit(*MCRegUnitIterator(X86::EFLAGS, tri))
+              .getSegmentContaining(slot(def8))) {
+    if (eflagseg->start <= slot(*bb.begin()) && bb.isLiveIn(X86::EFLAGS)) {
+      if (bb.pred_size() > 1) {
+        return nullptr;
+      }
+      add_seg(li.getMBBStartIdx(&bb), slot(def8), live, li);
+      return insert_mov32r0(*(*bb.pred_begin())->rbegin(), live, li);
+    }
+    ins = li.getInstructionFromIndex(eflagseg->start);
+  }
+  // insert dummy mov32r0
+  MachineInstrBuilder mib =
+      BuildMI(bb, ins, def8.getDebugLoc(),
+              f.getSubtarget().getInstrInfo()->get(X86::MOV32r0), 0);
+  return mib;
+}
+
+template <typename T, typename = is_iterable_of<T, LiveInterval *>>
+raw_ostream &operator<<(raw_ostream &out, const T &es) {
+  for (LiveInterval *e : es) {
+    out << "\t" << (*e) << "\n";
+  }
+  return out;
+}
+
+template <typename T, typename = is_iterable_of<T, LiveInterval *>>
+bool interferes(const T &as, const LiveInterval &b,
+                const MachineRegisterInfo &mri) {
+  return any_of(as, [&](const LiveInterval *a) { return a->overlaps(b); });
+}
+
+template <typename Iterator, typename Predicate>
+Iterator move_to_end_if(Iterator first, Iterator last, Predicate p) {
+  Iterator rv = last;
+  while (first != rv) {
+    if (p(*first)) {
+      --rv;
+      std::swap(*first, *rv);
+    } else {
+      ++first;
+    }
+  }
+  return rv;
+}
+
+template <typename Range, typename Predicate>
+auto move_to_end_if(Range &r, Predicate p) -> decltype(r.end()) {
+  return move_to_end_if(r.begin(), r.end(), std::move(p));
+}
+
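+// ReAllocTool layers a small backtracking allocator on top of LiveRegMatrix
+// and VirtRegMap. The protocol, as used in runOnMachineFunction below, is
+// roughly:
+//
+//   ReAllocTool ratool(f, lrm, vrm);
+//   c.unassign(ratool);                        // pull ranges out of the matrix
+//   if (MCPhysReg r = ratool.alloc(*c.extra))  // conflict-free GR32, or 0
+//     c.assign_new(lrm, li, r);                // commit the transformation
+//   else
+//     c.assign_old(lrm);                       // roll back to the RA result
+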
+struct ReAllocTool {
+  const TargetRegisterInfo *tri;
+  const MachineRegisterInfo *mri;
+  LiveRegMatrix *lrm;
+  VirtRegMap *vrm;
+  RegisterClassInfo rci;
+  BitVector unused_csr;
+
+  void add_reg_to_bv(BitVector &bv, MCPhysReg reg) const {
+    for (MCRegAliasIterator r(reg, tri, true); r.isValid(); ++r) {
+      bv.set(*r);
+    }
+  }
+
+  BitVector bv_from_regs(ArrayRef<MCPhysReg> regs) const {
+    BitVector rv(tri->getNumRegs());
+    for (const MCPhysReg &r : regs) {
+      add_reg_to_bv(rv, r);
+    }
+    return rv;
+  }
+
+  template <typename Predicate>
+  BitVector bv_from_regs(ArrayRef<MCPhysReg> regs, Predicate p) const {
+    BitVector rv(tri->getNumRegs());
+    for (const MCPhysReg &r : regs) {
+      if (p(r)) {
+        add_reg_to_bv(rv, r);
+      }
+    }
+    return rv;
+  }
+
+  ReAllocTool(const MachineFunction &f, LiveRegMatrix &lrm_, VirtRegMap &vrm_)
+      : tri(f.getSubtarget().getRegisterInfo()), mri(&f.getRegInfo()),
+        lrm(&lrm_), vrm(&vrm_), rci(), unused_csr(tri->getNumRegs()) {
+    const MCPhysReg *csr = tri->getCalleeSavedRegs(&f);
+    for (unsigned i = 0; csr[i] != 0; i += 1) {
+      if (!lrm->isPhysRegUsed(csr[i])) {
+        add_reg_to_bv(unused_csr, csr[i]);
+      }
+    }
+    rci.runOnMachineFunction(f);
+  }
+
+  bool interf(LiveInterval &live, unsigned preg) const {
+    return lrm->checkInterference(live, preg) != LiveRegMatrix::IK_Free;
+  }
+
+  template <typename T, typename = is_iterable_of<T, LiveInterval *>>
+  bool interf(LiveInterval &live, unsigned preg, T &evictees) const {
+    if (lrm->checkRegMaskInterference(live, preg) ||
+        lrm->checkRegUnitInterference(live, preg)) {
+      return true;
+    }
+    DenseSet<LiveInterval *> ev;
+    for (MCRegUnitIterator regunit(preg, tri); regunit.isValid(); ++regunit) {
+      LiveIntervalUnion::Query &q = lrm->query(live, *regunit);
+      if (q.collectInterferingVRegs() > 0) {
+        for (LiveInterval *l : q.interferingVRegs()) {
+          ev.insert(l);
+        }
+      }
+    }
+    std::copy(ev.begin(), ev.end(), push_to(evictees));
+    return evictees.size() > 0;
+  }
+
+  const MCPhysReg *alloc_next(LiveInterval &live,
+                              const BitVector *except = nullptr,
+                              ArrayRef<MCPhysReg>::iterator *it = nullptr,
+                              const TargetRegisterClass *rc = nullptr) const {
+    ArrayRef<MCPhysReg> ord =
+        rci.getOrder(rc ? rc : mri->getRegClass(live.reg));
+    BitVector rs = unused_csr;
+    if (except != nullptr) {
+      rs |= *except;
+    }
+    auto rv = std::find_if(
+        it ? std::next(*it) : ord.begin(), ord.end(),
+        [&](MCPhysReg r) { return !rs.test(r) && !interf(live, r); });
+    return rv == ord.end() ? nullptr : rv;
+  }
+
+  MCPhysReg alloc(LiveInterval &live, const BitVector *except = nullptr,
+                  const TargetRegisterClass *rc = nullptr) const {
+    const MCPhysReg *rv = alloc_next(live, except, nullptr, rc);
+    return rv == nullptr ? 0 : *rv;
+  }
+
+  // (re-)allocate a group of interfering intervals. brute force search.
+  // returns nullptr if impossible.
+  template <typename C, typename = is_iterable_of<C, LiveInterval *>>
+  unique_ptr<vector<pair<LiveInterval *, const MCPhysReg *>>>
+  alloc_interf_intervals(C group, const BitVector *except = nullptr) const {
+    if (group.empty()) {
+      return make_unique<vector<pair<LiveInterval *, const MCPhysReg *>>>();
+    }
+    auto assigned =
+        make_unique<vector<pair<LiveInterval *, const MCPhysReg *>>>();
+
+    auto maybe_unassign = [&](pair<LiveInterval *, const MCPhysReg *> &p) {
+      if (p.second) {
+        lrm->unassign(*p.first);
+      }
+    };
+
+    auto maybe_assign = [&](pair<LiveInterval *, const MCPhysReg *> &p) {
+      if (p.second) {
+        lrm->assign(*p.first, *p.second);
+      }
+    };
+
+    auto try_next_in_group = [&]() {
+      assert(!group.empty());
+      assigned->push_back(
+          std::make_pair(group.back(), alloc_next(*group.back(), except)));
+      group.pop_back();
+      maybe_assign(assigned->back());
+    };
+
+    auto back_to_previous = [&]() {
+      assert(!assigned->empty());
+      maybe_unassign(assigned->back());
+      group.push_back(assigned->back().first);
+      assigned->pop_back();
+    };
+
+    auto try_next_reg = [&]() {
+      assert(!assigned->empty());
+      maybe_unassign(assigned->back());
+      assigned->back().second =
+          alloc_next(*assigned->back().first, except, &assigned->back().second);
+      maybe_assign(assigned->back());
+    };
+
+    try_next_in_group();
+
+    while (!group.empty() || assigned->back().second == nullptr) {
+      if (assigned->back().second == nullptr) {
+        back_to_previous();
+        if (assigned->empty()) {
+          return nullptr;
+        }
+        try_next_reg();
+      } else {
+        try_next_in_group();
+      }
+    }
+    for (auto &p : *assigned) {
+      lrm->unassign(*p.first);
+    }
+    return assigned;
+  }
+
+  template <typename C, typename = is_iterable_of<C, LiveInterval *>>
+  unique_ptr<vector<MCPhysReg>>
+  evict_intervals(const C &lives, const BitVector *excepts = nullptr) const {
+    DenseMap<LiveInterval *, const MCPhysReg *> newmap;
+    vector<LiveInterval *> ungrouped(lives.begin(), lives.end());
+
+    while (!ungrouped.empty()) {
+      vector<LiveInterval *> group;
+      group.push_back(ungrouped.back());
+      ungrouped.pop_back();
+      bool done = false;
+      while (!done) {
+        auto it = move_to_end_if(ungrouped, [&](LiveInterval *h) {
+          return interferes(group, *h, *mri);
+        });
+        done = it == ungrouped.end();
+        std::copy(it, ungrouped.end(), push_to(group));
+        ungrouped.erase(it, ungrouped.end());
+      }
+      if (auto newassigns = alloc_interf_intervals(group, excepts)) {
+        for (auto pair_ : *newassigns) {
+          newmap.insert(pair_);
+        }
+      } else {
+        return nullptr;
+      }
+    }
+    auto rv = make_unique<vector<MCPhysReg>>();
+    transform(lives, push_to(*rv), [&](LiveInterval *l) { return *newmap[l]; });
+    return rv;
+  }
+
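+  // reserve_phys_reg below drives the helpers that follow as a transaction:
+  // unassign_all snapshots the allocator's current choices, evict_intervals
+  // proposes a conflict-free reshuffle, and assign_all commits whichever
+  // register vector (old or new) ends up being kept.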
+  MCPhysReg unassign(LiveInterval &live) {
+    unsigned old = get_phys(live.reg, *vrm);
+    lrm->unassign(live);
+    return old;
+  }
+
+  template <typename C, typename = is_iterable_of<C, LiveInterval *>>
+  vector<MCPhysReg> unassign_all(C &lives) {
+    vector<MCPhysReg> r;
+    transform(lives, push_to(r), [&](LiveInterval *l) { return unassign(*l); });
+    return r;
+  }
+
+  template <typename C, typename D,
+            typename = is_iterable_of<C, LiveInterval *>,
+            typename = is_iterable_of<D, MCPhysReg>>
+  void assign_all(C &lives, D &&regs) {
+    for (auto intv_reg : zip_first(lives, std::forward<D>(regs))) {
+      lrm->assign(*std::get<0>(intv_reg), std::get<1>(intv_reg));
+    }
+  }
+
+  bool reserve_phys_reg(MCPhysReg preg, LiveInterval &live) {
+    vector<LiveInterval *> evictees;
+    if (!interf(live, preg, evictees)) {
+      DEBUG(dbgs() << "ReAllocTool: " << tri->getName(preg)
+                   << " is already free.\n");
+      return true;
+    } else if (evictees.size() > 0) {
+      DEBUG(dbgs() << "ReAllocTool: trying to reserve " << tri->getName(preg)
+                   << " by evicting:\n"
+                   << evictees);
+      vector<MCPhysReg> oldregs = unassign_all(evictees);
+      BitVector bv = bv_from_regs(preg);
+      if (auto newregs = evict_intervals(evictees, &bv)) {
+        assign_all(evictees, *newregs);
+        return true;
+      }
+      assign_all(evictees, oldregs);
+    }
+    DEBUG(dbgs() << "ReAllocTool: unable to reserve " << tri->getName(preg)
+                 << "\n");
+    return false;
+  }
+};
+
+struct Candidate {
+  MachineInstr *ins;
+  MachineInstr *gr8def;
+  MachineInstr *movzx;
+  vector<MCPhysReg> constraints;
+  LiveInterval *live32;
+  LiveInterval *live8;
+  unique_ptr<LiveInterval> extra;
+  // private:
+  // assign/reassign
+  unsigned pdest;
+  unsigned psrc;
+
+  static MachineInstr *valid_candidate(MachineInstr &i, LiveIntervals &li) {
+    if (i.getOpcode() != X86::MOVZX32rr8 || i.getOperand(1).getSubReg() != 0) {
+      return nullptr;
+    }
+
+    const MachineFunction &f = *i.getParent()->getParent();
+    const MachineRegisterInfo &mri = f.getRegInfo();
+    const TargetRegisterInfo &tri = *f.getSubtarget().getRegisterInfo();
+
+    unsigned src = i.getOperand(1).getReg();
+    auto bbdefs = dominating_defs(src, mri, *li.getSlotIndexes());
+    if (bbdefs.size() > 1 || (mri.getSimpleHint(src) &&
+                              !tri.isVirtualRegister(mri.getSimpleHint(src)))) {
+      DEBUG(dbgs() << "passing over " << i << "defs: " << bbdefs.size()
+                   << ", gr8 hint: " << PrintReg(mri.getSimpleHint(src), &tri)
+                   << "\n");
+      return nullptr;
+    }
+    return bbdefs.begin()->second;
+  }
+
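+  // How a Candidate is built below: (1) insert a dummy MOV32r0 at a
+  // flag-safe point above the dominating GR8 def, (2) grow a scratch
+  // interval ("extra") covering the xor, the def, and both original live
+  // ranges, and (3) give up if "extra" overlaps the GR32 range, since the
+  // movzx could not be deleted in that case.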
+  static unique_ptr<Candidate> from_mi(MachineInstr &i, LiveIntervals &li,
+                                       const VirtRegMap &vrm) {
+    const MachineFunction &f = *i.getParent()->getParent();
+    const MachineRegisterInfo &mri = f.getRegInfo();
+    const TargetRegisterInfo &tri = *f.getSubtarget().getRegisterInfo();
+
+    MachineInstr *def, *ins;
+    if ((def = valid_candidate(i, li)) == nullptr) {
+      return nullptr;
+    }
+
+    unsigned dest = i.getOperand(0).getReg(), src = i.getOperand(1).getReg();
+    LiveInterval &live32 = li.getInterval(dest), &live8 = li.getInterval(src);
+    unique_ptr<LiveInterval> extra(new LiveInterval(live32.reg, live32.weight));
+
+    if ((ins = insert_mov32r0(*def, *extra, li)) == nullptr) {
+      return nullptr;
+    }
+
+    li.InsertMachineInstrInMaps(*ins);
+    add_seg(*ins, *def, *extra, li);
+    if (extra->overlaps(live32)) {
+      li.RemoveMachineInstrFromMaps(*ins);
+      ins->eraseFromParent();
+      return nullptr;
+    }
+
+    add_segs(live32, *extra, li);
+    add_segs(live8, *extra, li);
+
+    // look for copy instr reg alloc hints
+    vector<MCPhysReg> cx;
+    for (const MachineInstr &use : mri.use_instructions(dest)) {
+      if (use.isCopy() && !tri.isVirtualRegister(use.getOperand(0).getReg())) {
+        unsigned r =
+            use.getOperand(1).getSubReg()
+                ? tri.getMatchingSuperReg(use.getOperand(0).getReg(),
+                                          use.getOperand(1).getSubReg(),
+                                          mri.getRegClass(dest))
+                : get_phys(use.getOperand(0), vrm);
+        if (f.getSubtarget<X86Subtarget>().is64Bit() ||
+            X86::GR32_ABCDRegClass.contains(r)) {
+          cx.push_back(r);
+        }
+      }
+    }
+
+    return unique_ptr<Candidate>(new Candidate{
+        ins, def, &i, std::move(cx), &live32, &live8, std::move(extra), 0, 0});
+  }
+
+  bool operator<(const Candidate &b) const {
+    if (constraints.size() > 0 && b.constraints.size() == 0)
+      return true;
+    if (b.constraints.size() > 0 && constraints.size() == 0)
+      return false;
+    if (constraints.size() < b.constraints.size())
+      return true;
+    return li_size() > b.li_size();
+  }
+
+  unsigned li_size() const { return extra->getSize(); }
+
+  friend raw_ostream &operator<<(raw_ostream &out, const Candidate &c) {
+    out << "Candidate:\n\tinserted: " << (*c.ins)
+        << "\tgr8 def: " << (*c.gr8def) << "\tmovzx: " << (*c.movzx)
+        << "\txor gr32: " << (*c.extra);
+    if (c.constraints.size() > 0) {
+      out << "\n\tconstraints:";
+      for (unsigned cx : c.constraints) {
+        out << " " << PrintReg(cx, &c.tri());
+      }
+    } else {
+      out << "\n\tno constraints.";
+    }
+    return out;
+  }
+
+  const X86RegisterInfo &tri() const {
+    return *reinterpret_cast<const X86RegisterInfo *>(
+        ins->getParent()->getParent()->getSubtarget().getRegisterInfo());
+  }
+
+  const X86InstrInfo &tii() const {
+    return *reinterpret_cast<const X86InstrInfo *>(
+        ins->getParent()->getParent()->getSubtarget().getInstrInfo());
+  }
+
+  MachineRegisterInfo &mri() const {
+    return ins->getParent()->getParent()->getRegInfo();
+  }
+
+  void unassign(ReAllocTool &ratool) {
+    pdest = ratool.unassign(*live32);
+    psrc = ratool.unassign(*live8);
+  }
+
+  void assign_old(LiveRegMatrix &lrm) {
+    lrm.assign(*live32, pdest);
+    lrm.assign(*live8, psrc);
+    pdest = psrc = 0;
+  }
+
+  void assign_new(LiveRegMatrix &lrm, LiveIntervals &li, MCPhysReg newdest) {
+    // vsrc uses => vdest:sub_8bit; insert vdest = mov32r0; del movzx
+    unsigned vdest = movzx->getOperand(0).getReg();
+    unsigned vsrc = movzx->getOperand(1).getReg();
+
+    // in-place operand mutation would confuse defusechain_iterator
+    vector<MachineOperand *> ops;
+    transform(mri().reg_operands(vsrc), push_to(ops),
+              [](MachineOperand &op) { return &op; });
+    for (MachineOperand *op : ops) {
+      DEBUG(dbgs() << "changing " << (*op->getParent()));
+      op->substVirtReg(vdest, X86::sub_8bit, tri());
+      DEBUG(dbgs() << "to " << (*op->getParent()));
+    }
+
+    li.RemoveMachineInstrFromMaps(*movzx);
+    movzx->eraseFromParent();
+    li.removeInterval(vsrc);
+    li.removeInterval(vdest);
+
+    const TargetRegisterClass &destcls = *mri().getRegClass(vdest);
+    ins->getOperand(0).setReg(vdest);
+    if (destcls.getSize() > 32 / 8) {
+      ins->getOperand(0).setSubReg(X86::sub_32bit);
+      ins->getOperand(0).setIsUndef();
+    }
+    if (const TargetRegisterClass *newcls = gr8def->getRegClassConstraintEffect(
+            0, ins->getRegClassConstraintEffect(0, &destcls, &tii(), &tri()),
+            &tii(), &tri())) {
+      DEBUG(dbgs() << "updating reg class from "
+                   << tri().getRegClassName(&destcls) << " to "
+                   << tri().getRegClassName(newcls) << "\n");
+      mri().setRegClass(vdest, newcls);
+    } else {
+      DEBUG(dbgs() << "not updating reg class\n");
+    }
+    lrm.assign(li.createAndComputeVirtRegInterval(vdest), newdest);
+  }
+
+  bool valid_dest_reg(MCPhysReg physreg) const {
+    return mri().getRegClass(movzx->getOperand(0).getReg())->contains(physreg);
+  }
+};
+
+struct X86FixupZExt : public MachineFunctionPass {
+  static char id;
+
+  X86FixupZExt() : MachineFunctionPass(id) {}
+
+  const char *getPassName() const override {
+    return "X86 Zero-Extension Fix-up";
+  }
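+
+  // Note on placement: the pass is wired in via addPreRewrite (see the
+  // X86TargetMachine.cpp hunk below), i.e. after register allocation but
+  // before virtual registers are rewritten, which is why it consumes
+  // VirtRegMap, LiveIntervals, and LiveRegMatrix.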
+  void getAnalysisUsage(AnalysisUsage &a) const override {
+    a.addRequired<VirtRegMap>();
+    a.addRequired<LiveIntervals>();
+    a.addRequired<LiveRegMatrix>();
+    a.setPreservesAll();
+    return MachineFunctionPass::getAnalysisUsage(a);
+  }
+
+  bool runOnMachineFunction(MachineFunction &f) override {
+    VirtRegMap &vrm = getAnalysis<VirtRegMap>();
+    LiveIntervals &li = getAnalysis<LiveIntervals>();
+    LiveRegMatrix &lrm = getAnalysis<LiveRegMatrix>();
+    vector<Candidate> constrained, cands, dispose;
+    ReAllocTool ratool(f, lrm, vrm);
+
+    DEBUG(dbgs() << "analyzing " << f.getName() << "'s movzxes.\n");
+    for (MachineBasicBlock &bb : f) {
+      for (MachineInstr &i : bb) {
+        if (auto cand = Candidate::from_mi(i, li, vrm)) {
+          if (cand->constraints.size() > 0) {
+            constrained.emplace_back(std::move(*cand.release()));
+          } else {
+            cands.emplace_back(std::move(*cand.release()));
+          }
+        }
+      }
+    }
+
+    BitVector nosub8;
+    if (f.getSubtarget<X86Subtarget>().is64Bit()) {
+      nosub8 = ratool.bv_from_regs({X86::RIP});
+    } else {
+      nosub8 = ratool.bv_from_regs(ArrayRef<MCPhysReg>(
+          X86::GR32_ABCDRegClass.begin(), X86::GR32_ABCDRegClass.end()));
+      nosub8.flip();
+    }
+
+    DEBUG(vrm.print(dbgs()));
+    DEBUG(f.print(dbgs(), li.getSlotIndexes()));
+    std::sort(constrained.begin(), constrained.end());
+    std::for_each(constrained.begin(), constrained.end(), [&](Candidate &c) {
+      DEBUG(dbgs() << c << "\n");
+      c.unassign(ratool);
+      bool demote = true;
+      for (MCPhysReg preg : c.constraints) {
+        if (!nosub8.test(preg) && c.valid_dest_reg(preg) &&
+            ratool.reserve_phys_reg(preg, *c.extra)) {
+          DEBUG(dbgs() << "works\n");
+          c.assign_new(lrm, li, preg);
+          return;
+        }
+        // only demote if RA pass missed all hints
+        demote &= preg != c.pdest;
+      }
+      DEBUG(dbgs() << "could not transform\n");
+      c.assign_old(lrm);
+      if (demote) {
+        c.constraints.clear();
+        DEBUG(dbgs() << "demoting to unconstrained candidate\n");
+        cands.push_back(std::move(c));
+      } else {
+        dispose.push_back(std::move(c));
+      }
+    });
+
+    auto try_harder_to_alloc = [&](Candidate &c) {
+      for (MCPhysReg newreg : X86::GR32_ABCDRegClass) {
+        if (c.valid_dest_reg(newreg) && !ratool.unused_csr.test(newreg) &&
+            ratool.reserve_phys_reg(newreg, *c.extra)) {
+          return newreg;
+        }
+      }
+      return static_cast<MCPhysReg>(0);
+    };
+
+    std::sort(cands.begin(), cands.end());
+    for (Candidate &c : cands) {
+      DEBUG(dbgs() << c << "\n");
+      c.unassign(ratool);
+      MCPhysReg newreg;
+      if (!f.getSubtarget<X86Subtarget>().is64Bit() &&
+          ((newreg = ratool.alloc(*c.extra, &nosub8)) != 0 ||
+           (newreg = try_harder_to_alloc(c)) != 0)) {
+        DEBUG(dbgs() << "works\n");
+        c.assign_new(lrm, li, newreg);
+      } else if (f.getSubtarget<X86Subtarget>().is64Bit() &&
+                 (newreg = ratool.alloc(*c.extra, &nosub8)) != 0) {
+        DEBUG(dbgs() << "works\n");
+        c.assign_new(lrm, li, newreg);
+      } else {
+        DEBUG(dbgs() << "could not transform\n");
+        c.assign_old(lrm);
+        dispose.push_back(std::move(c));
+      }
+    }
+
+    for (Candidate &c : dispose) {
+      DEBUG(dbgs() << "purging dummy instr: " << (*c.ins));
+      li.RemoveMachineInstrFromMaps(*c.ins);
+      c.ins->eraseFromParent();
+    }
+    return false;
+  }
+};
+
+char X86FixupZExt::id = 0;
+}
+
+namespace llvm {
+FunctionPass *createX86FixupZExt() { return new X86FixupZExt(); }
+}
Index: lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- lib/Target/X86/X86TargetMachine.cpp
+++ lib/Target/X86/X86TargetMachine.cpp
@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "X86TargetMachine.h"
 #include "X86.h"
+#include "X86TargetMachine.h"
 #include "X86TargetObjectFile.h"
 #include "X86TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
@@ -29,6 +29,10 @@
     cl::desc("Enable the machine combiner pass"),
     cl::init(true), cl::Hidden);
 
+static cl::opt<bool> EnableSetCCFixup("setcc-fixup",
+                                      cl::desc("Apply X86FixupSetCC"),
+                                      cl::init(false), cl::Hidden);
+
 namespace llvm {
 void initializeWinEHStatePassPass(PassRegistry &);
 }
@@ -238,7 +242,6 @@
   });
 }
 
-
 //===----------------------------------------------------------------------===//
 // Pass Pipeline Configuration
 //===----------------------------------------------------------------------===//
@@ -260,6 +263,7 @@
   bool addPreISel() override;
   void addPreRegAlloc() override;
   void addPostRegAlloc() override;
+  bool addPreRewrite() override;
   void addPreEmitPass() override;
   void addPreSched2() override;
 };
@@ -305,8 +309,10 @@
 
 void X86PassConfig::addPreRegAlloc() {
   if (getOptLevel() != CodeGenOpt::None) {
-    addPass(createX86FixupSetCC());
-    addPass(createX86OptimizeLEAs());
+    if (EnableSetCCFixup) {
+      addPass(createX86FixupSetCC());
+    }
+    addPass(createX86OptimizeLEAs());
   }
 
   addPass(createX86CallFrameOptimization());
@@ -317,6 +323,13 @@
     addPass(createX86FloatingPointStackifierPass());
 }
 
+bool X86PassConfig::addPreRewrite() {
+  if (!EnableSetCCFixup) {
+    addPass(createX86FixupZExt());
+  }
+  return false;
+}
+
 void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); }
 
 void X86PassConfig::addPreEmitPass() {
Index: test/CodeGen/X86/avx-intrinsics-x86.ll
===================================================================
--- test/CodeGen/X86/avx-intrinsics-x86.ll
+++ test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -149,20 +149,20 @@
 define i32 @test_x86_sse2_comieq_sd(<2 x double> %a0, <2 x double> %a1) {
 ; AVX-LABEL: test_x86_sse2_comieq_sd:
 ; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
 ; AVX-NEXT: vcomisd %xmm1, %xmm0
-; AVX-NEXT: setnp %al
-; AVX-NEXT: sete %cl
-; AVX-NEXT: andb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: setnp %cl
+; AVX-NEXT: sete %al
+; AVX-NEXT: andb %cl, %al
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse2_comieq_sd:
 ; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
 ; AVX512VL-NEXT: vcomisd %xmm1, %xmm0
-; AVX512VL-NEXT: setnp %al
-; AVX512VL-NEXT: sete %cl
-; AVX512VL-NEXT: andb %al, %cl
-; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: setnp %cl
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: andb %cl, %al
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -253,20 +253,20 @@
 define i32 @test_x86_sse2_comineq_sd(<2 x double> %a0, <2 x double> %a1) {
 ; AVX-LABEL: test_x86_sse2_comineq_sd:
 ; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
 ; AVX-NEXT: vcomisd %xmm1, %xmm0
-; AVX-NEXT: setp %al
-; AVX-NEXT: setne %cl
-; AVX-NEXT: orb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: setp %cl
+; AVX-NEXT: setne %al
+; AVX-NEXT: orb %cl, %al
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse2_comineq_sd:
 ; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
 ; AVX512VL-NEXT: vcomisd %xmm1, %xmm0
-; AVX512VL-NEXT: setp %al
-; AVX512VL-NEXT: setne %cl
-; AVX512VL-NEXT: orb %al, %cl
-; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: setp %cl
+; AVX512VL-NEXT: setne %al
+; AVX512VL-NEXT: orb %cl, %al
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -1240,20 +1240,20 @@
 define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) {
 ; AVX-LABEL: test_x86_sse2_ucomieq_sd:
 ; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
 ; AVX-NEXT: vucomisd %xmm1, %xmm0
-; AVX-NEXT: setnp %al
-; AVX-NEXT: sete %cl
-; AVX-NEXT: andb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: setnp %cl
+; AVX-NEXT: sete %al
+; AVX-NEXT: andb %cl, %al
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse2_ucomieq_sd:
 ; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
 ; AVX512VL-NEXT: vucomisd %xmm1, %xmm0
-; AVX512VL-NEXT: setnp %al
-; AVX512VL-NEXT: sete %cl
-; AVX512VL-NEXT: andb %al, %cl
-; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: setnp %cl
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: andb %cl, %al
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -1344,20 +1344,20 @@
 define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) {
 ; AVX-LABEL: test_x86_sse2_ucomineq_sd:
 ; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
 ; AVX-NEXT: vucomisd %xmm1, %xmm0
-; AVX-NEXT: setp %al
-; AVX-NEXT: setne %cl
-; AVX-NEXT: orb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: setp %cl
+; AVX-NEXT: setne %al
+; AVX-NEXT: orb %cl, %al
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse2_ucomineq_sd:
 ; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
 ; AVX512VL-NEXT: vucomisd %xmm1, %xmm0
-; AVX512VL-NEXT: setp %al
-; AVX512VL-NEXT: setne %cl
-; AVX512VL-NEXT: orb %al, %cl
-; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: setp %cl
+; AVX512VL-NEXT: setne %al
+; AVX512VL-NEXT: orb %cl, %al
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -1943,29 +1943,23 @@
 }
 
-define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) {
 ; AVX-LABEL: test_x86_sse42_pcmpestria128:
 ; AVX: ## BB#0:
-; AVX-NEXT: pushl %ebx
 ; AVX-NEXT: movl $7, %eax
 ; AVX-NEXT: movl $7, %edx
-; AVX-NEXT: xorl %ebx, %ebx
 ; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; AVX-NEXT: seta %bl
-; AVX-NEXT: movl %ebx, %eax
-; AVX-NEXT: popl %ebx
+; AVX-NEXT: seta %al
+; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse42_pcmpestria128:
 ; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: pushl %ebx
 ; AVX512VL-NEXT: movl $7, %eax
 ; AVX512VL-NEXT: movl $7, %edx
-; AVX512VL-NEXT: xorl %ebx, %ebx
 ; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; AVX512VL-NEXT: seta %bl
-; AVX512VL-NEXT: movl %ebx, %eax
-; AVX512VL-NEXT: popl %ebx
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: movzbl %al, %eax
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse42.pcmpestria128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
   ret i32 %res
@@ -1997,29 +1991,23 @@
 declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
 
-define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) {
 ; AVX-LABEL: test_x86_sse42_pcmpestrio128:
 ; AVX: ## BB#0:
-; AVX-NEXT: pushl %ebx
 ; AVX-NEXT: movl $7, %eax
 ; AVX-NEXT: movl $7, %edx
-; AVX-NEXT: xorl %ebx, %ebx
 ; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; AVX-NEXT: seto %bl
-; AVX-NEXT: movl %ebx, %eax
-; AVX-NEXT: popl %ebx
+; AVX-NEXT: seto %al
+; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse42_pcmpestrio128:
 ; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: pushl %ebx
 ; AVX512VL-NEXT: movl $7, %eax
 ; AVX512VL-NEXT: movl $7, %edx
-; AVX512VL-NEXT: xorl %ebx, %ebx
 ; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; AVX512VL-NEXT: seto %bl
-; AVX512VL-NEXT: movl %ebx, %eax
-; AVX512VL-NEXT: popl %ebx
+; AVX512VL-NEXT: seto %al
+; AVX512VL-NEXT: movzbl %al, %eax
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
   ret i32 %res
@@ -2027,29 +2015,23 @@
 declare i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
 
-define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) {
 ; AVX-LABEL: test_x86_sse42_pcmpestris128:
 ; AVX: ## BB#0:
-; AVX-NEXT: pushl %ebx
 ; AVX-NEXT: movl $7, %eax
 ; AVX-NEXT: movl $7, %edx
-; AVX-NEXT: xorl %ebx, %ebx
 ; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; AVX-NEXT: sets %bl
-; AVX-NEXT: movl %ebx, %eax
-; AVX-NEXT: popl %ebx
+; AVX-NEXT: sets %al
+; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse42_pcmpestris128:
 ; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: pushl %ebx
 ; AVX512VL-NEXT: movl $7, %eax
 ; AVX512VL-NEXT: movl $7, %edx
-; AVX512VL-NEXT: xorl %ebx, %ebx
 ; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; AVX512VL-NEXT: sets %bl
-; AVX512VL-NEXT: movl %ebx, %eax
-; AVX512VL-NEXT: popl %ebx
+; AVX512VL-NEXT: sets %al
+; AVX512VL-NEXT: movzbl %al, %eax
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse42.pcmpestris128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
   ret i32 %res
@@ -2057,29 +2039,23 @@
 declare i32 @llvm.x86.sse42.pcmpestris128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
 
-define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) {
 ; AVX-LABEL: test_x86_sse42_pcmpestriz128:
 ; AVX: ## BB#0:
-; AVX-NEXT: pushl %ebx
 ; AVX-NEXT: movl $7, %eax
 ; AVX-NEXT: movl $7, %edx
-; AVX-NEXT: xorl %ebx, %ebx
 ; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; AVX-NEXT: sete %bl
-; AVX-NEXT: movl %ebx, %eax
-; AVX-NEXT: popl %ebx
+; AVX-NEXT: sete %al
+; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse42_pcmpestriz128:
 ; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: pushl %ebx
 ; AVX512VL-NEXT: movl $7, %eax
 ; AVX512VL-NEXT: movl $7, %edx
-; AVX512VL-NEXT: xorl %ebx, %ebx
 ; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; AVX512VL-NEXT: sete %bl
-; AVX512VL-NEXT: movl %ebx, %eax
-; AVX512VL-NEXT: popl %ebx
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: movzbl %al, %eax
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
   ret i32 %res
@@ -2357,20 +2333,20 @@
 define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) {
 ; AVX-LABEL: test_x86_sse_comieq_ss:
 ; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
 ; AVX-NEXT: vcomiss %xmm1, %xmm0
-; AVX-NEXT: setnp %al
-; AVX-NEXT: sete %cl
-; AVX-NEXT: andb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: setnp %cl
+; AVX-NEXT: sete %al
+; AVX-NEXT: andb %cl, %al
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse_comieq_ss:
 ; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
 ; AVX512VL-NEXT: vcomiss %xmm1, %xmm0
-; AVX512VL-NEXT: setnp %al
-; AVX512VL-NEXT: sete %cl
-; AVX512VL-NEXT: andb %al, %cl
-; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: setnp %cl
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: andb %cl, %al
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -2461,20 +2437,20 @@
 define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) {
 ; AVX-LABEL: test_x86_sse_comineq_ss:
 ; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
 ; AVX-NEXT: vcomiss %xmm1, %xmm0
-; AVX-NEXT: setp %al
-; AVX-NEXT: setne %cl
-; AVX-NEXT: orb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: setp %cl
+; AVX-NEXT: setne %al
+; AVX-NEXT: orb %cl, %al
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse_comineq_ss:
 ; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
 ; AVX512VL-NEXT: vcomiss %xmm1, %xmm0
-; AVX512VL-NEXT: setp %al
-; AVX512VL-NEXT: setne %cl
-; AVX512VL-NEXT: orb %al, %cl
-; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: setp %cl
+; AVX512VL-NEXT: setne %al
+; AVX512VL-NEXT: orb %cl, %al
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -2797,20 +2773,20 @@
 define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) {
 ; AVX-LABEL: test_x86_sse_ucomieq_ss:
 ; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
 ; AVX-NEXT: vucomiss %xmm1, %xmm0
-; AVX-NEXT: setnp %al
-; AVX-NEXT: sete %cl
-; AVX-NEXT: andb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: setnp %cl
+; AVX-NEXT: sete %al
+; AVX-NEXT: andb %cl, %al
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse_ucomieq_ss:
 ; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
 ; AVX512VL-NEXT: vucomiss %xmm1, %xmm0
-; AVX512VL-NEXT: setnp %al
-; AVX512VL-NEXT: sete %cl
-; AVX512VL-NEXT: andb %al, %cl
-; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: setnp %cl
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: andb %cl, %al
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -2901,20 +2877,20 @@
 define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) {
 ; AVX-LABEL: test_x86_sse_ucomineq_ss:
 ; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
 ; AVX-NEXT: vucomiss %xmm1, %xmm0
-; AVX-NEXT: setp %al
-; AVX-NEXT: setne %cl
-; AVX-NEXT: orb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: setp %cl
+; AVX-NEXT: setne %al
+; AVX-NEXT: orb %cl, %al
 ; AVX-NEXT: retl
 ;
 ; AVX512VL-LABEL: test_x86_sse_ucomineq_ss:
 ; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
 ; AVX512VL-NEXT: vucomiss %xmm1, %xmm0
-; AVX512VL-NEXT: setp %al
-; AVX512VL-NEXT: setne %cl
-; AVX512VL-NEXT: orb %al, %cl
-; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: setp %cl
+; AVX512VL-NEXT: setne %al
+; AVX512VL-NEXT: orb %cl, %al
 ; AVX512VL-NEXT: retl
   %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res
Index: test/CodeGen/X86/avx512-cmp.ll
===================================================================
--- test/CodeGen/X86/avx512-cmp.ll
+++ test/CodeGen/X86/avx512-cmp.ll
@@ -51,11 +51,11 @@
 define i32 @test3(float %a, float %b) {
 ; ALL-LABEL: test3:
 ; ALL: ## BB#0:
+; ALL-NEXT: xorl %eax, %eax
 ; ALL-NEXT: vucomiss %xmm1, %xmm0
-; ALL-NEXT: setnp %al
-; ALL-NEXT: sete %cl
-; ALL-NEXT: andb %al, %cl
-; ALL-NEXT: movzbl %cl, %eax
+; ALL-NEXT: setnp %cl
+; ALL-NEXT: sete %al
+; ALL-NEXT: andb %cl, %al
 ; ALL-NEXT: retq
 
   %cmp10.i = fcmp oeq float %a, %b
@@ -67,12 +67,12 @@
 ; ALL-LABEL: test5:
 ; ALL: ## BB#0: ## %entry
 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT: xorl %eax, %eax
 ; ALL-NEXT: vucomiss %xmm1, %xmm0
 ; ALL-NEXT: jne LBB3_1
 ; ALL-NEXT: jnp LBB3_2
 ; ALL-NEXT: LBB3_1: ## %if.end
 ; ALL-NEXT: seta %al
-; ALL-NEXT: movzbl %al, %eax
 ; ALL-NEXT: leaq {{.*}}(%rip), %rcx
 ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; ALL-NEXT: LBB3_2: ## %return
Index: test/CodeGen/X86/cmpxchg-i1.ll
===================================================================
--- test/CodeGen/X86/cmpxchg-i1.ll
+++ test/CodeGen/X86/cmpxchg-i1.ll
@@ -34,7 +34,7 @@
 ; CHECK-LABEL: cmpxchg_sext:
 ; CHECK-DAG: cmpxchgl
 ; CHECK-NOT: cmpl
-; CHECK: sete %cl
+; CHECK: sete %al
 ; CHECK: retq
   %pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
   %success = extractvalue { i32, i1 } %pair, 1
@@ -44,10 +44,10 @@
 
 define i32 @cmpxchg_zext(i32* %addr, i32 %desired, i32 %new) {
 ; CHECK-LABEL: cmpxchg_zext:
-; CHECK: xorl %e[[R:[a-z]]]x
 ; CHECK: cmpxchgl
 ; CHECK-NOT: cmp
-; CHECK: sete %[[R]]l
+; CHECK: sete [[BYTE:%[a-z0-9]+]]
+; CHECK: movzbl [[BYTE]], %eax
   %pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
   %success = extractvalue { i32, i1 } %pair, 1
   %mask = zext i1 %success to i32
Index: test/CodeGen/X86/cmpxchg-i128-i1.ll
===================================================================
--- test/CodeGen/X86/cmpxchg-i128-i1.ll
+++ test/CodeGen/X86/cmpxchg-i128-i1.ll
@@ -44,10 +44,10 @@
 
 define i128 @cmpxchg_zext(i128* %addr, i128 %desired, i128 %new) {
 ; CHECK-LABEL: cmpxchg_zext:
-; CHECK: xorl
 ; CHECK: cmpxchg16b
 ; CHECK-NOT: cmpq
-; CHECK: sete
+; CHECK: sete [[BYTE:%[a-z0-9]+]]
+; CHECK: movzbl [[BYTE]], %eax
   %pair = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst seq_cst
   %success = extractvalue { i128, i1 } %pair, 1
   %mask = zext i1 %success to i128
Index: test/CodeGen/X86/fast-isel-cmp.ll
===================================================================
--- test/CodeGen/X86/fast-isel-cmp.ll
+++ test/CodeGen/X86/fast-isel-cmp.ll
@@ -8,9 +8,9 @@
 ; SDAG-NEXT: andl $1, %eax
 ; FAST-LABEL: fcmp_oeq
 ; FAST: ucomiss %xmm1, %xmm0
-; FAST-NEXT: sete %al
-; FAST-NEXT: setnp %cl
-; FAST-NEXT: andb %al, %cl
+; FAST-NEXT: sete %cl
+; FAST-NEXT: setnp %al
+; FAST-NEXT: andb %cl, %al
   %1 = fcmp oeq float %x, %y
   ret i1 %1
 }
@@ -153,9 +153,9 @@
 ; SDAG-NEXT: andl $1, %eax
 ; FAST-LABEL: fcmp_une
 ; FAST: ucomiss %xmm1, %xmm0
-; FAST-NEXT: setne %al
-; FAST-NEXT: setp %cl
-; FAST-NEXT: orb %al, %cl
+; FAST-NEXT: setne %cl
+; FAST-NEXT: setp %al
+; FAST-NEXT: orb %cl, %al
   %1 = fcmp une float %x, %y
   ret i1 %1
 }
@@ -290,10 +290,11 @@
 ; SDAG-NEXT: andl $1, %eax
 ; FAST-LABEL: fcmp_oeq3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: sete %al
-; FAST-NEXT: setnp %cl
-; FAST-NEXT: andb %al, %cl
+; FAST-NEXT: sete %cl
+; FAST-NEXT: setnp %al
+; FAST-NEXT: andb %cl, %al
   %1 = fcmp oeq float %x, 0.000000e+00
   ret i1 %1
 }
@@ -314,6 +315,7 @@
 ; SDAG-NEXT: seta %al
 ; FAST-LABEL: fcmp_ogt3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm1, %xmm0
 ; FAST-NEXT: seta %al
   %1 = fcmp ogt float %x, 0.000000e+00
@@ -338,6 +340,7 @@
 ; SDAG-NEXT: setae %al
 ; FAST-LABEL: fcmp_oge3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm1, %xmm0
 ; FAST-NEXT: setae %al
   %1 = fcmp oge float %x, 0.000000e+00
@@ -360,6 +363,7 @@
 ; SDAG-NEXT: seta %al
 ; FAST-LABEL: fcmp_olt3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm0, %xmm1
 ; FAST-NEXT: seta %al
   %1 = fcmp olt float %x, 0.000000e+00
@@ -384,6 +388,7 @@
 ; SDAG-NEXT: setae %al
 ; FAST-LABEL: fcmp_ole3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm0, %xmm1
 ; FAST-NEXT: setae %al
   %1 = fcmp ole float %x, 0.000000e+00
@@ -406,6 +411,7 @@
 ; SDAG-NEXT: setne %al
 ; FAST-LABEL: fcmp_one3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm1, %xmm0
 ; FAST-NEXT: setne %al
   %1 = fcmp one float %x, 0.000000e+00
@@ -472,6 +478,7 @@
 ; SDAG-NEXT: sete %al
 ; FAST-LABEL: fcmp_ueq3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm1, %xmm0
 ; FAST-NEXT: sete %al
   %1 = fcmp ueq float %x, 0.000000e+00
@@ -496,6 +503,7 @@
 ; SDAG-NEXT: setb %al
 ; FAST-LABEL: fcmp_ugt3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm0, %xmm1
 ; FAST-NEXT: setb %al
   %1 = fcmp ugt float %x, 0.000000e+00
@@ -518,6 +526,7 @@
 ; SDAG-NEXT: setbe %al
 ; FAST-LABEL: fcmp_uge3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm0, %xmm1
 ; FAST-NEXT: setbe %al
   %1 = fcmp uge float %x, 0.000000e+00
@@ -542,6 +551,7 @@
 ; SDAG-NEXT: setb %al
 ; FAST-LABEL: fcmp_ult3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm1, %xmm0
 ; FAST-NEXT: setb %al
   %1 = fcmp ult float %x, 0.000000e+00
@@ -564,6 +574,7 @@
 ; SDAG-NEXT: setbe %al
 ; FAST-LABEL: fcmp_ule3
 ; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm1, %xmm0
 ; FAST-NEXT: setbe %al
   %1 = fcmp ule float %x, 0.000000e+00
@@ -589,10 +600,11 @@
 ; SDAG-NEXT: andl $1, %eax
 ; FAST-LABEL: fcmp_une3
 ; FAST: xorps %xmm1, %xmm1
+; FAST: xorl %eax, %eax
 ; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: setne %al
-; FAST-NEXT: setp %cl
-; FAST-NEXT: orb %al, %cl
+; FAST-NEXT: setne %cl
+; FAST-NEXT: setp %al
+; FAST-NEXT: orb %cl, %al
   %1 = fcmp une float %x, 0.000000e+00
   ret i1 %1
 }
Index: test/CodeGen/X86/fp128-cast.ll
===================================================================
--- test/CodeGen/X86/fp128-cast.ll
+++ test/CodeGen/X86/fp128-cast.ll
@@ -238,8 +238,7 @@
 ; X64-LABEL: TestConst128:
 ; X64: movaps {{.*}}, %xmm1
 ; X64-NEXT: callq __gttf2
-; X64-NEXT: xorl
-; X64-NEXT: test
+; X64: test
 ; X64: retq
 }
Index: test/CodeGen/X86/fp128-compare.ll
===================================================================
--- test/CodeGen/X86/fp128-compare.ll
+++ test/CodeGen/X86/fp128-compare.ll
@@ -1,101 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s
 ; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s
 
 define i32 @TestComp128GT(fp128 %d1, fp128 %d2) {
-entry:
-  %cmp = fcmp ogt fp128 %d1, %d2
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
 ; CHECK-LABEL: TestComp128GT:
-; CHECK: callq __gttf2
-; CHECK: xorl %ecx, %ecx
-; CHECK: setg %cl
-; CHECK: movl %ecx, %eax
-; CHECK: retq
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq __gttf2
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setg %al
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+entry:
+  %cmp = fcmp ogt fp128 %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
 }
 
 define i32 @TestComp128GE(fp128 %d1, fp128 %d2) {
-entry:
-  %cmp = fcmp oge fp128 %d1, %d2
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
 ; CHECK-LABEL: TestComp128GE:
-; CHECK: callq __getf2
-; CHECK: xorl %ecx, %ecx
-; CHECK: testl %eax, %eax
-; CHECK: setns %cl
-; CHECK: movl %ecx, %eax
-; CHECK: retq
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq __getf2
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setns %al
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+entry:
+  %cmp = fcmp oge fp128 %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
 }
 
 define i32 @TestComp128LT(fp128 %d1, fp128 %d2) {
-entry:
-  %cmp = fcmp olt fp128 %d1, %d2
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
 ; CHECK-LABEL: TestComp128LT:
-; CHECK: callq __lttf2
-; CHECK-NEXT: shrl $31, %eax
-; CHECK: retq
-;
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp2:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq __lttf2
+; CHECK-NEXT: shrl $31, %eax
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+entry:
+  %cmp = fcmp olt fp128 %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
 ; The 'shrl' is a special optimization in llvm to combine
 ; the effect of 'fcmp olt' and 'zext'. The main purpose is
 ; to test soften call to __lttf2.
 }
 
 define i32 @TestComp128LE(fp128 %d1, fp128 %d2) {
-entry:
-  %cmp = fcmp ole fp128 %d1, %d2
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
 ; CHECK-LABEL: TestComp128LE:
-; CHECK: callq __letf2
-; CHECK: xorl %ecx, %ecx
-; CHECK: testl %eax, %eax
-; CHECK: setle %cl
-; CHECK: movl %ecx, %eax
-; CHECK: retq
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp3:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq __letf2
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setle %al
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+entry:
+  %cmp = fcmp ole fp128 %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
 }
 
 define i32 @TestComp128EQ(fp128 %d1, fp128 %d2) {
-entry:
-  %cmp = fcmp oeq fp128 %d1, %d2
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
 ; CHECK-LABEL: TestComp128EQ:
-; CHECK: callq __eqtf2
-; CHECK: xorl %ecx, %ecx
-; CHECK: testl %eax, %eax
-; CHECK: sete %cl
-; CHECK: movl %ecx, %eax
-; CHECK: retq
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp4:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq __eqtf2
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+entry:
+  %cmp = fcmp oeq fp128 %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
 }
 
 define i32 @TestComp128NE(fp128 %d1, fp128 %d2) {
-entry:
-  %cmp = fcmp une fp128 %d1, %d2
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
 ; CHECK-LABEL: TestComp128NE:
-; CHECK: callq __netf2
-; CHECK: xorl %ecx, %ecx
-; CHECK: testl %eax, %eax
-; CHECK: setne %cl
-; CHECK: movl %ecx, %eax
-; CHECK: retq
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp5:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq __netf2
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+entry:
+  %cmp = fcmp une fp128 %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
 }
 
 define fp128 @TestMax(fp128 %x, fp128 %y) {
+; CHECK-LABEL: TestMax:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: .Ltmp6:
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; CHECK-NEXT: callq __gttf2
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: jg .LBB6_2
+; CHECK-NEXT: # BB#1: # %entry
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: .LBB6_2: # %entry
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
 entry:
   %cmp = fcmp ogt fp128 %x, %y
   %cond = select i1 %cmp, fp128 %x, fp128 %y
   ret fp128 %cond
-; CHECK-LABEL: TestMax:
-; CHECK: movaps %xmm0
-; CHECK: movaps %xmm1
-; CHECK: callq __gttf2
-; CHECK: movaps {{.*}}, %xmm0
-; CHECK: testl %eax, %eax
-; CHECK: movaps {{.*}}, %xmm0
-; CHECK: retq
 }
Index: test/CodeGen/X86/sse-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -570,20 +570,20 @@
 define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
 ; X32-LABEL: test_mm_comieq_ss:
 ; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
 ; X32-NEXT: comiss %xmm1, %xmm0
-; X32-NEXT: setnp %al
-; X32-NEXT: sete %cl
-; X32-NEXT: andb %al, %cl
-; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: setnp %cl
+; X32-NEXT: sete %al
+; X32-NEXT: andb %cl, %al
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_comieq_ss:
 ; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: comiss %xmm1, %xmm0
-; X64-NEXT: setnp %al
-; X64-NEXT: sete %cl
-; X64-NEXT: andb %al, %cl
-; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: setnp %cl
+; X64-NEXT: sete %al
+; X64-NEXT: andb %cl, %al
 ; X64-NEXT: retq
   %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
   ret i32 %res
@@ -669,20 +669,20 @@
 define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
 ; X32-LABEL: test_mm_comineq_ss:
 ; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
 ; X32-NEXT: comiss %xmm1, %xmm0
-; X32-NEXT: setp %al
-; X32-NEXT: setne %cl
-; X32-NEXT: orb %al, %cl
-; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: setp %cl
+; X32-NEXT: setne %al
+; X32-NEXT: orb %cl, %al
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_comineq_ss:
 ; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: comiss %xmm1, %xmm0
-; X64-NEXT: setp %al
-; X64-NEXT: setne %cl
-; X64-NEXT: orb %al, %cl
-; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: setp %cl
+; X64-NEXT: setne %al
+; X64-NEXT: orb %cl, %al
 ; X64-NEXT: retq
   %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1)
   ret i32 %res
@@ -2071,20 +2071,20 @@
 define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
 ; X32-LABEL: test_mm_ucomieq_ss:
 ; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
 ; X32-NEXT: ucomiss %xmm1, %xmm0
-; X32-NEXT: setnp %al
-; X32-NEXT: sete %cl
-; X32-NEXT: andb %al, %cl
-; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: setnp %cl
+; X32-NEXT: sete %al
+; X32-NEXT: andb %cl, %al
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_ucomieq_ss:
 ; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: ucomiss %xmm1, %xmm0
-; X64-NEXT: setnp %al
-; X64-NEXT: sete %cl
-; X64-NEXT: andb %al, %cl
-; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: setnp %cl
+; X64-NEXT: sete %al
+; X64-NEXT: andb %cl, %al
 ; X64-NEXT: retq
   %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
   ret i32 %res
@@ -2170,20 +2170,20 @@
 define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
 ; X32-LABEL: test_mm_ucomineq_ss:
 ; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
 ; X32-NEXT: ucomiss %xmm1, %xmm0
-; X32-NEXT: setp %al
-; X32-NEXT: setne %cl
-; X32-NEXT: orb %al, %cl
-; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: setp %cl
+; X32-NEXT: setne %al
+; X32-NEXT: orb %cl, %al
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_ucomineq_ss:
 ; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: ucomiss %xmm1, %xmm0
-; X64-NEXT: setp %al
-; X64-NEXT: setne %cl
-; X64-NEXT: orb %al, %cl
-; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: setp %cl
+; X64-NEXT: setne %al
+; X64-NEXT: orb %cl, %al
 ; X64-NEXT: retq
   %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1)
   ret i32 %res
Index: test/CodeGen/X86/sse-intrinsics-x86.ll
===================================================================
--- test/CodeGen/X86/sse-intrinsics-x86.ll
+++ test/CodeGen/X86/sse-intrinsics-x86.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse | FileCheck %s --check-prefix=SSE
 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL
@@ -53,20 +54,20 @@
 define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) {
 ; SSE-LABEL: test_x86_sse_comieq_ss:
 ; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
 ; SSE-NEXT: comiss %xmm1, %xmm0
-; SSE-NEXT: setnp %al
-; SSE-NEXT: sete %cl
-; SSE-NEXT: andb %al, %cl
-; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: setnp %cl
+; SSE-NEXT: sete %al
+; SSE-NEXT: andb %cl, %al
 ; SSE-NEXT: retl
 ;
 ; KNL-LABEL: test_x86_sse_comieq_ss:
 ; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
 ; KNL-NEXT: vcomiss %xmm1, %xmm0
-; KNL-NEXT: setnp %al
-; KNL-NEXT: sete %cl
-; KNL-NEXT: andb %al, %cl
-; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: setnp %cl
+; KNL-NEXT: sete %al
+; KNL-NEXT: andb %cl, %al
 ; KNL-NEXT: retl
   %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -157,20 +158,20 @@
 define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) {
 ; SSE-LABEL: test_x86_sse_comineq_ss:
 ; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
 ; SSE-NEXT: comiss %xmm1, %xmm0
-; SSE-NEXT: setp %al
-; SSE-NEXT: setne %cl
-; SSE-NEXT: orb %al, %cl
-; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: setp %cl
+; SSE-NEXT: setne %al
+; SSE-NEXT: orb %cl, %al
 ; SSE-NEXT: retl
 ;
 ; KNL-LABEL: test_x86_sse_comineq_ss:
 ; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
 ; KNL-NEXT: vcomiss %xmm1, %xmm0
-; KNL-NEXT: setp %al
-; KNL-NEXT: setne %cl
-; KNL-NEXT: orb %al, %cl
-; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: setp %cl
+; KNL-NEXT: setne %al
+; KNL-NEXT: orb %cl, %al
 ; KNL-NEXT: retl
   %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -493,20 +494,20 @@
 define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) {
 ; SSE-LABEL: test_x86_sse_ucomieq_ss:
 ; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
 ; SSE-NEXT: ucomiss %xmm1, %xmm0
-; SSE-NEXT: setnp %al
-; SSE-NEXT: sete %cl
-; SSE-NEXT: andb %al, %cl
-; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: setnp %cl
+; SSE-NEXT: sete %al
+; SSE-NEXT: andb %cl, %al
 ; SSE-NEXT: retl
 ;
 ; KNL-LABEL: test_x86_sse_ucomieq_ss:
 ; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
 ; KNL-NEXT: vucomiss %xmm1, %xmm0
-; KNL-NEXT: setnp %al
-; KNL-NEXT: sete %cl
-; KNL-NEXT: andb %al, %cl
-; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: setnp %cl
+; KNL-NEXT: sete %al
+; KNL-NEXT: andb %cl, %al
 ; KNL-NEXT: retl
   %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -597,20 +598,20 @@
 define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) {
 ; SSE-LABEL: test_x86_sse_ucomineq_ss:
 ; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
 ; SSE-NEXT: ucomiss %xmm1, %xmm0
-; SSE-NEXT: setp %al
-; SSE-NEXT: setne %cl
-; SSE-NEXT: orb %al, %cl
-; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: setp %cl
+; SSE-NEXT: setne %al
+; SSE-NEXT: orb %cl, %al
 ; SSE-NEXT: retl
 ;
 ; KNL-LABEL: test_x86_sse_ucomineq_ss:
 ; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
 ; KNL-NEXT: vucomiss %xmm1, %xmm0
-; KNL-NEXT: setp %al
-; KNL-NEXT: setne %cl
-; KNL-NEXT: orb %al, %cl
-; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: setp %cl
+; KNL-NEXT: setne %al
+; KNL-NEXT: orb %cl, %al
 ; KNL-NEXT: retl
   %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res