Index: include/llvm/Target/TargetInstrInfo.h =================================================================== --- include/llvm/Target/TargetInstrInfo.h +++ include/llvm/Target/TargetInstrInfo.h @@ -1440,6 +1440,17 @@ virtual void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {} + /// May return true if the instruction in question is a dependency breaking + /// instruction. If so, the register number for which it is dependency + /// breaking should be returned in `OutReg`. It is preferable to return + /// false if the result cannot be determined. This would at worst result + /// in the insertion of an unnecessary instruction, while the other + /// alternative could result in significant false-dependency penalties. + virtual bool isDependencyBreak(MachineInstr &MI, + unsigned *OutReg = nullptr) const { + return false; + } + /// Create machine specific model for scheduling. virtual DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const { Index: lib/CodeGen/ExecutionDepsFix.cpp =================================================================== --- lib/CodeGen/ExecutionDepsFix.cpp +++ lib/CodeGen/ExecutionDepsFix.cpp @@ -214,13 +214,18 @@ bool isBlockDone(MachineBasicBlock *); void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass, bool Done); void updateSuccessors(MachineBasicBlock *MBB, bool Primary, bool Done); - bool visitInstr(MachineInstr *); + bool visitInstr(MachineInstr *, bool PrimaryPass); void processDefs(MachineInstr *, bool BlockDone, bool Kill); void visitSoftInstr(MachineInstr*, unsigned mask); void visitHardInstr(MachineInstr*, unsigned domain); - void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, - unsigned Pref); + void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, unsigned Pref, + bool &TrueDependency); bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref); + + // Undef Reads + void collapseUndefReads(unsigned from, unsigned to, unsigned Reg); + unsigned updateChooseableRegs(SparseSet<unsigned> &, + const TargetRegisterClass *, bool); void processUndefReads(MachineBasicBlock*); }; } @@ -394,11 +399,19 @@ // This is the entry block. if (MBB->pred_empty()) { + // Treat all registers as being defined just before the first instruction. + // However, we want the logic later to prefer non-live-ins over live-ins, + // so pretend the live-ins were defined slightly later. + // We used to only do this for live-ins, but that's a bit of a gamble. + // If our caller does arithmetic with these registers it is quite likely + // that it will have used registers beyond the ones that are live here. + // Given the immense penalty for getting this wrong, being conservative + // here seems worth it. + for (unsigned rx = 0; rx != NumRegs; ++rx) { + LiveRegs[rx].Def = -2; + } for (const auto &LI : MBB->liveins()) { for (int rx : regIndices(LI.PhysReg)) { - // Treat function live-ins as if they were defined just before the first - // instruction. Usually, function arguments are set up immediately - // before the call. LiveRegs[rx].Def = -1; } } @@ -470,24 +483,36 @@ LiveRegs = nullptr; } -bool ExeDepsFix::visitInstr(MachineInstr *MI) { - // Update instructions with explicit execution domains.
- std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI); - if (DomP.first) { - if (DomP.second) - visitSoftInstr(MI, DomP.second); - else - visitHardInstr(MI, DomP.first); +bool ExeDepsFix::visitInstr(MachineInstr *MI, bool PrimaryPass) { + bool Kill = false; + + if (PrimaryPass) { + // Update instructions with explicit execution domains. + std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI); + if (DomP.first) { + if (DomP.second) + visitSoftInstr(MI, DomP.second); + else + visitHardInstr(MI, DomP.first); + } + Kill = !DomP.first; } - return !DomP.first; + // If this is a call, pretend all registers we are considering are def'd here. + // We have no idea which registers the callee may use. + if (MI->isCall()) { + for (unsigned i = 0, e = NumRegs; i != e; ++i) + LiveRegs[i].Def = CurInstr; + } + + return Kill; } /// \brief Helps avoid false dependencies on undef registers by updating the /// machine instructions' undef operand to use a register that the instruction /// is truly dependent on, or use a register with clearance higher than Pref. void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, - unsigned Pref) { + unsigned Pref, bool &TrueDependency) { MachineOperand &MO = MI->getOperand(OpIdx); assert(MO.isUndef() && "Expected undef machine operand"); @@ -510,6 +535,7 @@ // We found a true dependency - replace the undef register with the true // dependency. MO.setReg(CurrMO.getReg()); + TrueDependency = true; return; } @@ -571,9 +597,14 @@ if (BlockDone) { unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); if (Pref) { - pickBestRegisterForUndef(MI, OpNum, Pref); - if (shouldBreakDependence(MI, OpNum, Pref)) + bool TrueDependency = false; + pickBestRegisterForUndef(MI, OpNum, Pref, TrueDependency); + // Don't bother adding true dependencies to UndefReads. All we'd find out + // is that the register is live (since this very instruction depends on + // it), so we can't do anything. + if (!TrueDependency && shouldBreakDependence(MI, OpNum, Pref)) { UndefReads.push_back(std::make_pair(MI, OpNum)); + } } } const MCInstrDesc &MCID = MI->getDesc(); @@ -606,9 +637,52 @@ kill(rx); } } + unsigned DepReg = 0; + if (TII->isDependencyBreak(*MI, &DepReg)) { + for (int rx : regIndices(DepReg)) { + // This instruction is a dependency break, so there are no clearance + // issues; reset the counter. + LiveRegs[rx].Def = -(1 << 20); + } + } ++CurInstr; } +// Set the undef read register to `Reg` for all UndefReads in the range +// [from,to). +void ExeDepsFix::collapseUndefReads(unsigned from, unsigned to, unsigned Reg) { + if (from >= to) + return; + for (unsigned i = from; i < to; ++i) { + MachineInstr *MI = std::get<0>(UndefReads[i]); + unsigned OpIdx = std::get<1>(UndefReads[i]); + MachineOperand &MO = MI->getOperand(OpIdx); + MO.setReg(Reg); + } + TII->breakPartialRegDependency(*std::get<0>(UndefReads[from]), + std::get<1>(UndefReads[from]), TRI); +} + +unsigned ExeDepsFix::updateChooseableRegs(SparseSet<unsigned> &ChoosableRegs, + const TargetRegisterClass *OpRC, + bool add) { + unsigned LowestValid = (unsigned)-1; + ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(OpRC); + for (auto Reg : Order) { + if (LiveRegSet.contains(Reg)) + ChoosableRegs.erase(Reg); + else if (add) { + ChoosableRegs.insert(Reg); + if (LowestValid == (unsigned)-1) + LowestValid = Reg; + } else if (ChoosableRegs.count(Reg) == 1) { + if (LowestValid == (unsigned)-1) + LowestValid = Reg; + } + } + return LowestValid; +} + /// \break Break false dependencies on undefined register reads.
/// /// Walk the block backward computing precise liveness. This is expensive, so we @@ -619,31 +693,87 @@ if (UndefReads.empty()) return; + // We want to be slightly clever here, to avoid the following common pattern: + // Suppose we have some instruction `vrandom %in, %out` and the following code + // vrandom %xmm0, %xmm0 + // vrandom %xmm1, %xmm1 + // vrandom %xmm2, %xmm2 + // vrandom %xmm3, %xmm3 + // The earlier logic likes to produce these, because it picks the first + // register + // to break ties in clearance. However, most register allocators pick the dest + // register the same way. Naively, we'd have to insert a dependency break + // before every instruction above. However, what we really want is + // vxorps %xmm3, %xmm3, %xmm3 + // vrandom %xmm3, %xmm0 + // vrandom %xmm3, %xmm1 + // vrandom %xmm3, %xmm2 + // vrandom %xmm3, %xmm3 + // To do so, we walk backwards and cumulatively keep track of which registers + // we can use to break the dependency. Then, once the set has collapsed, we + // reset the undef read register for all following instructions. + // Collect this block's live out register units. LiveRegSet.init(*TRI); // We do not need to care about pristine registers as they are just preserved // but not actually used in the function. LiveRegSet.addLiveOutsNoPristines(*MBB); - MachineInstr *UndefMI = UndefReads.back().first; - unsigned OpIdx = UndefReads.back().second; + SparseSet<unsigned> ChoosableRegs; + ChoosableRegs.setUniverse(TRI->getNumRegs()); + + unsigned LastValid = (unsigned)-1; + const TargetRegisterClass *LastOpRC = nullptr; + size_t i, LastInit; + i = LastInit = UndefReads.size() - 1; + MachineInstr *UndefMI = std::get<0>(UndefReads[i]); for (MachineInstr &I : make_range(MBB->rbegin(), MBB->rend())) { // Update liveness, including the current instruction's defs. LiveRegSet.stepBackward(I); + // This ensures that we don't accidentally pick a register whose live region + // lies entirely between two undef reads (since that would defeat the + // purpose of breaking the dependency). + for (auto LiveReg : LiveRegSet) + ChoosableRegs.erase(LiveReg); + if (UndefMI == &I) { - if (!LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg())) - TII->breakPartialRegDependency(*UndefMI, OpIdx, TRI); + unsigned OpIdx = std::get<1>(UndefReads[i]); + // Get the undef operand's register class + const TargetRegisterClass *OpRC = + TII->getRegClass(UndefMI->getDesc(), OpIdx, TRI, *MF); + if (OpRC != LastOpRC || ChoosableRegs.size() == 0) { + if (LastInit != i) { + if (LastValid != (unsigned)-1) + collapseUndefReads(i + 1, LastInit + 1, LastValid); + ChoosableRegs.clear(); + LastInit = i; + } + } + + unsigned LowestValid = + updateChooseableRegs(ChoosableRegs, OpRC, LastInit == i); + + if (ChoosableRegs.size() == 0) { + if (LastInit != i) { + if (LastValid != (unsigned)-1) + collapseUndefReads(i + 1, LastInit + 1, LastValid); + LowestValid = updateChooseableRegs(ChoosableRegs, OpRC, true); + LastInit = i; + } + } + LastValid = LowestValid; + LastOpRC = OpRC; - UndefReads.pop_back(); - if (UndefReads.empty()) - return; + if (i == 0) + break; - UndefMI = UndefReads.back().first; - OpIdx = UndefReads.back().second; + UndefMI = std::get<0>(UndefReads[--i]); } } + if (LastValid != (unsigned)-1) + collapseUndefReads(0, LastInit + 1, LastValid); } // A hard instruction only works in one domain.
All input registers will be @@ -793,9 +923,7 @@ enterBasicBlock(MBB); for (MachineInstr &MI : *MBB) { if (!MI.isDebugValue()) { - bool Kill = false; - if (PrimaryPass) - Kill = visitInstr(&MI); + bool Kill = visitInstr(&MI, PrimaryPass); processDefs(&MI, isBlockDone(MBB), Kill); } } Index: lib/Target/X86/X86InstrInfo.h =================================================================== --- lib/Target/X86/X86InstrInfo.h +++ lib/Target/X86/X86InstrInfo.h @@ -484,6 +484,7 @@ const TargetRegisterInfo *TRI) const override; void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override; + bool isDependencyBreak(MachineInstr &MI, unsigned *OutReg) const override; MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, unsigned OpNum, Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -7496,6 +7496,23 @@ } } +bool X86InstrInfo::isDependencyBreak(MachineInstr &MI, unsigned *OutReg) const { + unsigned Opc = MI.getOpcode(); + if (!(Opc == X86::VXORPSrr || Opc == X86::VXORPDrr || Opc == X86::XORPSrr || + Opc == X86::XORPDrr)) + return false; + unsigned Reg = 0; + for (unsigned i = 0; i < MI.getNumOperands(); ++i) { + const MachineOperand &MO = MI.getOperand(i); + if (!MO.isReg() || (Reg != 0 && MO.getReg() != Reg)) + return false; + Reg = MO.getReg(); + } + if (OutReg) + *OutReg = Reg; + return true; +} + MachineInstr * X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, Index: test/CodeGen/X86/avx-cvt.ll =================================================================== --- test/CodeGen/X86/avx-cvt.ll +++ test/CodeGen/X86/avx-cvt.ll @@ -74,6 +74,7 @@ define double @funcA(i64* nocapture %e) nounwind uwtable readonly ssp { ; CHECK-LABEL: funcA: ; CHECK: # BB#0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 ; CHECK-NEXT: retq %tmp1 = load i64, i64* %e, align 8 @@ -84,6 +85,7 @@ define double @funcB(i32* nocapture %e) nounwind uwtable readonly ssp { ; CHECK-LABEL: funcB: ; CHECK: # BB#0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 ; CHECK-NEXT: retq %tmp1 = load i32, i32* %e, align 4 @@ -94,6 +96,7 @@ define float @funcC(i32* nocapture %e) nounwind uwtable readonly ssp { ; CHECK-LABEL: funcC: ; CHECK: # BB#0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 ; CHECK-NEXT: retq %tmp1 = load i32, i32* %e, align 4 @@ -104,6 +107,7 @@ define float @funcD(i64* nocapture %e) nounwind uwtable readonly ssp { ; CHECK-LABEL: funcD: ; CHECK: # BB#0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 ; CHECK-NEXT: retq %tmp1 = load i64, i64* %e, align 8 @@ -114,6 +118,7 @@ define void @fpext() nounwind uwtable { ; CHECK-LABEL: fpext: ; CHECK: # BB#0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcvtss2sd -{{[0-9]+}}(%rsp), %xmm0, %xmm0 ; CHECK-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: retq @@ -144,5 +149,3 @@ ret float %res } declare float @llvm.floor.f32(float %p) - - Index: test/CodeGen/X86/avx512-cvt.ll =================================================================== --- test/CodeGen/X86/avx512-cvt.ll +++ test/CodeGen/X86/avx512-cvt.ll @@ -16,13 +16,14 @@ ; KNL: ## BB#0: ; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; KNL-NEXT: vpextrq $1, %xmm1, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; 
KNL-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 ; KNL-NEXT: vmovq %xmm1, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm1 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; KNL-NEXT: vpextrq $1, %xmm2, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 ; KNL-NEXT: vmovq %xmm2, %rax ; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] @@ -55,7 +56,8 @@ ; KNL: ## BB#0: ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; KNL-NEXT: vpextrq $1, %xmm1, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; KNL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; KNL-NEXT: vmovq %xmm1, %rax ; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -79,7 +81,8 @@ ; KNL-LABEL: sltof2f32: ; KNL: ## BB#0: ; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; KNL-NEXT: vmovq %xmm0, %rax ; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -100,9 +103,10 @@ ; KNL: ## BB#0: ; KNL-NEXT: vmovdqu (%rdi), %ymm0 ; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; KNL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vmovq %xmm0, %rax @@ -180,9 +184,10 @@ ; KNL-LABEL: sltof432: ; KNL: ## BB#0: ; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; KNL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vmovq %xmm0, %rax @@ -205,9 +210,10 @@ ; KNL-LABEL: ultof432: ; KNL: ## BB#0: ; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; KNL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 ; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 ; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vmovq %xmm0, %rax @@ -231,13 +237,14 @@ ; KNL: ## BB#0: ; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; KNL-NEXT: vpextrq $1, %xmm1, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 +; KNL-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 ; KNL-NEXT: vmovq %xmm1, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 +; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm1 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; KNL-NEXT: vpextrq $1, %xmm2, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3 +; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3 ; KNL-NEXT: vmovq %xmm2, %rax ; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] @@ -479,6 +486,7 @@ define double @funcA(i64* nocapture %e) { ; ALL-LABEL: funcA: ; ALL: ## BB#0: ## %entry +; ALL-NEXT: vxorps %xmm0, %xmm0, 
%xmm0 ; ALL-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 ; ALL-NEXT: retq entry: @@ -490,6 +498,7 @@ define double @funcB(i32* %e) { ; ALL-LABEL: funcB: ; ALL: ## BB#0: ## %entry +; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; ALL-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 ; ALL-NEXT: retq entry: @@ -501,6 +510,7 @@ define float @funcC(i32* %e) { ; ALL-LABEL: funcC: ; ALL: ## BB#0: ## %entry +; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; ALL-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 ; ALL-NEXT: retq entry: @@ -512,6 +522,7 @@ define float @i64tof32(i64* %e) { ; ALL-LABEL: i64tof32: ; ALL: ## BB#0: ## %entry +; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; ALL-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 ; ALL-NEXT: retq entry: @@ -720,6 +731,7 @@ define float @uitofp02(i32 %a) nounwind { ; ALL-LABEL: uitofp02: ; ALL: ## BB#0: +; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; ALL-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0 ; ALL-NEXT: retq %b = uitofp i32 %a to float @@ -729,6 +741,7 @@ define double @uitofp03(i32 %a) nounwind { ; ALL-LABEL: uitofp03: ; ALL: ## BB#0: +; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; ALL-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0 ; ALL-NEXT: retq %b = uitofp i32 %a to double @@ -1122,6 +1135,7 @@ ; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; KNL-NEXT: vpextrq $1, %xmm0, %rax ; KNL-NEXT: andl $1, %eax +; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1 ; KNL-NEXT: vmovq %xmm0, %rax ; KNL-NEXT: andl $1, %eax Index: test/CodeGen/X86/break-false-dep.ll =================================================================== --- test/CodeGen/X86/break-false-dep.ll +++ test/CodeGen/X86/break-false-dep.ll @@ -231,35 +231,55 @@ ;AVX: vcvtss2sd [[XMM0:%xmm[0-9]+]], [[XMM0]], {{%xmm[0-9]+}} } -; Make sure we are making a smart choice regarding undef registers and -; choosing the register with the highest clearence -define double @clearence(i64 %arg) { +define double @clearence(double %x, i64 %arg) { top: - tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"() - tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() - tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm7},~{dirflag},~{fpsr},~{flags}"() +;AVX-LABEL:@clearence +; This is carefully constructed to force LLVM to materialize a vxorps, which +; also implicitly breaks the dependency, making it a good candidate for the +; undef read below +;AVX: vxorps [[XMM1:%xmm1]], [[XMM1]], [[XMM1]] +;AVX: vucomisd [[XMM1]], %xmm0 + %0 = fcmp ult double %x, 0.0 + br i1 %0, label %main, label %fake + +main: + tail call void asm sideeffect "", "~{xmm0},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() %tmp1 = sitofp i64 %arg to double ret double %tmp1 -;AVX-LABEL:@clearence -;AVX: vxorps [[XMM6:%xmm6]], [[XMM6]], [[XMM6]] -;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM6]], {{%xmm[0-9]+}} +; Check that we re-use the dependency break from above +;AVX-NOT: vxorps +;AVX: vcvtsi2sdq {{.*}}, [[XMM1]], {{%xmm[0-9]+}} +fake: + ret double 0.0 } ; Make sure we are making a smart choice regarding undef registers in order to ; avoid a cyclic dependence on a write to the same register in a previous ; iteration, especially when we cannot zero out the undef register because it ; is alive. 
-define i64 @loopclearence(i64* nocapture %x, double* nocapture %y) nounwind { +define i64 @loopclearence(float %z, double %a, double %b, double %c, i64* nocapture %x, double* nocapture %y) nounwind { entry: %vx = load i64, i64* %x - br label %loop +;AVX-LABEL:@loopclearence +;AVX: vxorps [[XMM4_7:%xmm[4-7]]], [[XMM4_7]], [[XMM4_7]] +;AVX-NEXT: vucomiss [[XMM4_7]], %xmm0 + %0 = fcmp ult float %z, 0.0 + br i1 %0, label %loop, label %fake + loop: %i = phi i64 [ 1, %entry ], [ %inc, %loop ] %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ] + store volatile double %a, double *%y + store volatile double %b, double *%y + store volatile double %c, double *%y + ; AVX-NOT: {{%xmm[4-7]}} + ; This register was forced to have an xorps above, so it should simply be re-used + ; AVX-NOT: vxorps + ; AVX: vcvtsi2sdq {{.*}}, [[XMM4_7]], {{%xmm[0-9]+}} %fi = sitofp i64 %i to double - tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() %vy = load double, double* %y @@ -271,23 +291,26 @@ br i1 %exitcond, label %ret, label %loop ret: ret i64 %s2 -;AVX-LABEL:@loopclearence -;Registers 4-7 are not used and therefore one of them should be chosen -;AVX-NOT: {{%xmm[4-7]}} -;AVX: vcvtsi2sdq {{.*}}, [[XMM4_7:%xmm[4-7]]], {{%xmm[0-9]+}} -;AVX-NOT: [[XMM4_7]] +fake: + ret i64 0 } ; Make sure we are making a smart choice regarding undef registers even for more ; complicated loop structures. This example is the inner loop from ; julia> a = falses(10000); a[1:4:end] = true ; julia> linspace(1.0,2.0,10000)[a] -define void @loopclearance2(double* nocapture %y, i64* %x, double %c1, double %c2, double %c3, double %c4, i64 %size) { +define double @loopclearance2(double %z, double %c1, double %c2, double %c3, double %c4, double %c5, + double* nocapture %y, i64* %x, i64 %size) { entry: - tail call void asm sideeffect "", "~{xmm7},~{dirflag},~{fpsr},~{flags}"() - tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() - tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() - br label %loop + %fadd = fadd double %c4, %c5 + ;AVX-LABEL:@loopclearance2 +; AVX: vxorps [[XMM6:%xmm6]], [[XMM6]], [[XMM6]] +; AVX-NEXT: vucomisd [[XMM6]], %xmm + %cmp1 = fcmp ult double %fadd, 0.0 + br i1 %cmp1, label %loop, label %fake + +fake: + ret double %z loop: %phi_i = phi i64 [ 1, %entry ], [ %nexti, %loop_end ] @@ -314,13 +337,14 @@ ; the only reasonable choice. The primary thing we care about is that it's ; not one of the registers used in the loop (e.g. not the output reg here) ;AVX-NOT: %xmm6 -;AVX: vcvtsi2sdq {{.*}}, %xmm6, {{%xmm[0-9]+}} +;AVX-NOT: vxorps +;AVX-NOT: vxorpd +;AVX: vcvtsi2sdq {{.*}}, [[XMM6]], {{%xmm[0-9]+}} ;AVX-NOT: %xmm6 %nexti_f = sitofp i64 %nexti to double %sub = fsub double %c1, %nexti_f %mul = fmul double %sub, %c2 -;AVX: vcvtsi2sdq {{.*}}, %xmm6, {{%xmm[0-9]+}} -;AVX-NOT: %xmm6 +;AVX: vcvtsi2sdq {{.*}}, [[XMM6]], {{%xmm[0-9]+}} %phi_f = sitofp i64 %phi to double %mul2 = fmul double %phi_f, %c3 %add2 = fadd double %mul, %mul2 @@ -332,5 +356,37 @@ br i1 %done, label %loopdone, label %loop loopdone: + ret double 0.0 +} + +; Make sure that calls kill register clearance and that we don't insert +; an extra dependency-breaking instruction if one suffices.
+declare double @sin(double %x) +define void @callclearance(double *%x, i64 *%y, i64 *%z) { +entry: + br label %loop + +loop: + %idx = phi i32 [0, %entry], [%idx, %loop] + %valptr = getelementptr i64, i64* %y, i32 %idx + %valptr2 = getelementptr i64, i64* %z, i32 %idx + %outptr = getelementptr double, double* %x, i32 %idx +;AVX-LABEL:@callclearance +;AVX: vxorps [[THEXMM:%xmm[0-9]+]], [[THEXMM]], [[THEXMM]] +;AVX: vcvtsi2sdq {{.*}}, [[THEXMM]], {{%xmm[0-9]+}} +;AVX-NOT: vxorps +;AVX: vcvtsi2sdq {{.*}}, [[THEXMM]], {{%xmm[0-9]+}} + %val = load i64, i64 *%valptr + %val_f = sitofp i64 %val to double + %val2 = load i64, i64 *%valptr2 + %val2_f = sitofp i64 %val2 to double + %sined = call double @sin(double %val_f) + %sined2 = call double @sin(double %val2_f) + %sum = fadd double %sined, %sined2 + store double %sum, double *%x + %done = icmp sgt i32 %idx, 10000 + br i1 %done, label %end, label %loop + +end: ret void } Index: test/CodeGen/X86/combine-fcopysign.ll =================================================================== --- test/CodeGen/X86/combine-fcopysign.ll +++ test/CodeGen/X86/combine-fcopysign.ll @@ -228,6 +228,7 @@ ; SSE-LABEL: combine_vec_fcopysign_fpext_sgn: ; SSE: # BB#0: ; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: xorps %xmm4, %xmm4 ; SSE-NEXT: cvtss2sd %xmm2, %xmm4 ; SSE-NEXT: movshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] ; SSE-NEXT: movaps %xmm2, %xmm6 @@ -282,6 +283,7 @@ ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: movaps {{.*#+}} xmm5 ; SSE-NEXT: andps %xmm5, %xmm0 +; SSE-NEXT: xorps %xmm6, %xmm6 ; SSE-NEXT: cvtsd2ss %xmm1, %xmm6 ; SSE-NEXT: movaps {{.*#+}} xmm4 = [-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00] ; SSE-NEXT: andps %xmm4, %xmm6 Index: test/CodeGen/X86/fold-load-unops.ll =================================================================== --- test/CodeGen/X86/fold-load-unops.ll +++ test/CodeGen/X86/fold-load-unops.ll @@ -88,6 +88,7 @@ ; ; AVX-LABEL: rcpss_size: ; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load float, float* %a @@ -105,6 +106,7 @@ ; ; AVX-LABEL: rsqrtss_size: ; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load float, float* %a @@ -122,6 +124,7 @@ ; ; AVX-LABEL: sqrtss_size: ; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load float, float* %a @@ -139,6 +142,7 @@ ; ; AVX-LABEL: sqrtsd_size: ; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load double, double* %a Index: test/CodeGen/X86/half.ll =================================================================== --- test/CodeGen/X86/half.ll +++ test/CodeGen/X86/half.ll @@ -101,13 +101,15 @@ ; CHECK-LIBCALL-NEXT: pushq [[ADDR:%[a-z]+]] ; CHECK-LIBCALL-NEXT: movq %rsi, [[ADDR]] +; CHECK-LIBCALL-NEXT: xorps ; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0 ; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee ; CHECK-LIBCALL-NEXT: movw %ax, ([[ADDR]]) ; CHECK_LIBCALL-NEXT: popq [[ADDR]] ; CHECK_LIBCALL-NEXT: retq -; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, [[REG0:%[a-z0-9]+]], [[REG0]] +; CHECK-F16C-NEXT: vxorps [[REG0:%[a-z0-9]+]], [[REG0]], [[REG0]] +; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, [[REG0]], [[REG0]] ; CHECK-F16C-NEXT: vcvtps2ph $4, [[REG0]], [[REG0]] ; CHECK-F16C-NEXT: vmovd [[REG0]], %eax ; CHECK-F16C-NEXT: movw %ax, (%rsi) @@ -161,7 +163,9 @@ ; CHECK-NEXT: js [[LABEL1:.LBB[0-9_]+]] ; simple conversion to float if non-negative +; 
CHECK-LIBCALL-NEXT: xorps ; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, [[REG1:%[a-z0-9]+]] +; CHECK-F16C-NEXT: vxorps ; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, [[REG1:%[a-z0-9]+]], [[REG1]] ; CHECK-NEXT: jmp [[LABEL2:.LBB[0-9_]+]] @@ -171,8 +175,10 @@ ; CHECK-NEXT: shrq %rax ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: orq %rax, [[REG2:%[a-z0-9]+]] +; CHECK-LIBCALL-NEXT: xorps ; CHECK-LIBCALL-NEXT: cvtsi2ssq [[REG2]], [[REG3:%[a-z0-9]+]] ; CHECK-LIBCALL-NEXT: addss [[REG3]], [[REG1]] +; CHECK-F16C-NEXT: vxorps ; CHECK-F16C-NEXT: vcvtsi2ssq [[REG2]], [[REG3:%[a-z0-9]+]], [[REG3]] ; CHECK-F16C-NEXT: vaddss [[REG3]], [[REG3]], [[REG1:[%a-z0-9]+]] @@ -287,6 +293,7 @@ ; CHECK-LIBCALL-NEXT: movzwl (%rsi), %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee ; CHECK-LIBCALL-NEXT: movss %xmm0, 12(%rsp) +; CHECK-LIBCALL-NEXT: xorps %xmm0, %xmm0 ; CHECK-LIBCALL-NEXT: cvtsi2ssl %ebx, %xmm0 ; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee ; CHECK-LIBCALL-NEXT: movzwl %ax, %edi @@ -299,6 +306,7 @@ ; CHECK-F16C-NEXT: movswl (%rsi), %eax ; CHECK-F16C-NEXT: vmovd %eax, %xmm0 ; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; CHECK-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-F16C-NEXT: vcvtsi2ssl %edi, %xmm1, %xmm1 ; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; CHECK-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 Index: test/CodeGen/X86/i64-to-float.ll =================================================================== --- test/CodeGen/X86/i64-to-float.ll +++ test/CodeGen/X86/i64-to-float.ll @@ -278,9 +278,10 @@ ; X64-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax -; X64-AVX-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 +; X64-AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-AVX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1 ; X64-AVX-NEXT: vmovq %xmm0, %rax -; X64-AVX-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 +; X64-AVX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 ; X64-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-AVX-NEXT: retq %clo = icmp slt <2 x i64> %a, Index: test/CodeGen/X86/isint.ll =================================================================== --- test/CodeGen/X86/isint.ll +++ test/CodeGen/X86/isint.ll @@ -9,6 +9,7 @@ ; CHECK-NOT: xor ; CHECK: cvt %i = fptosi double %d to i32 +; CHECK-NEXT: xor ; CHECK-NEXT: cvt %e = sitofp i32 %i to double ; CHECK: cmpeqsd @@ -26,6 +27,7 @@ ; CHECK-NOT: xor ; CHECK: cvt %i = fptosi float %f to i32 +; CHECK-NEXT: xor ; CHECK-NEXT: cvt %g = sitofp i32 %i to float ; CHECK: cmpeqss @@ -43,6 +45,7 @@ ; CHECK-LABEL: isint_branch: ; CHECK: cvt %i = fptosi double %d to i32 +; CHECK-NEXT: xor ; CHECK-NEXT: cvt %e = sitofp i32 %i to double ; CHECK: ucomisd Index: test/CodeGen/X86/known-bits-vector.ll =================================================================== --- test/CodeGen/X86/known-bits-vector.ll +++ test/CodeGen/X86/known-bits-vector.ll @@ -42,7 +42,7 @@ ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] ; X64-NEXT: vmovq %xmm0, %rax -; X64-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; X64-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm0 ; X64-NEXT: retq %1 = and <2 x i64> %a0, %2 = extractelement <2 x i64> %1, i32 0 Index: test/CodeGen/X86/recip-fastmath.ll =================================================================== --- test/CodeGen/X86/recip-fastmath.ll +++ test/CodeGen/X86/recip-fastmath.ll @@ -30,6 +30,7 @@ define float @f32_one_step(float %x) #1 { ; SSE-LABEL: f32_one_step: ; SSE: # BB#0: +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: rcpss %xmm0, %xmm2 ; SSE-NEXT: mulss %xmm2, %xmm0 ; 
SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero @@ -55,6 +56,7 @@ define float @f32_two_step(float %x) #2 { ; SSE-LABEL: f32_two_step: ; SSE: # BB#0: +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: rcpss %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: mulss %xmm2, %xmm3 @@ -267,4 +269,3 @@ attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" } attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" } attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" } - Index: test/CodeGen/X86/recip-fastmath2.ll =================================================================== --- test/CodeGen/X86/recip-fastmath2.ll +++ test/CodeGen/X86/recip-fastmath2.ll @@ -23,6 +23,7 @@ define float @f32_one_step_2(float %x) #1 { ; SSE-LABEL: f32_one_step_2: ; SSE: # BB#0: +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: rcpss %xmm0, %xmm2 ; SSE-NEXT: mulss %xmm2, %xmm0 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero @@ -50,6 +51,7 @@ define float @f32_two_step_2(float %x) #2 { ; SSE-LABEL: f32_two_step_2: ; SSE: # BB#0: +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: rcpss %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: mulss %xmm2, %xmm3 @@ -271,4 +273,3 @@ attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" } attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" } attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:0,vec-divf:0" } - Index: test/CodeGen/X86/sqrt-fastmath-tune.ll =================================================================== --- test/CodeGen/X86/sqrt-fastmath-tune.ll +++ test/CodeGen/X86/sqrt-fastmath-tune.ll @@ -13,7 +13,8 @@ define float @foo_x1(float %f) #0 { ; SCALAR-EST-LABEL: foo_x1: ; SCALAR-EST: # BB#0: -; SCALAR-EST-NEXT: rsqrtss %xmm0 +; SCALAR-EST-NEXT: xorps %xmm1 +; SCALAR-EST-NEXT: rsqrtss %xmm0, %xmm1 ; SCALAR-EST: retq ; ; SCALAR-ACC-LABEL: foo_x1: Index: test/CodeGen/X86/sqrt-fastmath.ll =================================================================== --- test/CodeGen/X86/sqrt-fastmath.ll +++ test/CodeGen/X86/sqrt-fastmath.ll @@ -57,6 +57,7 @@ define float @finite_f32_estimate(float %f) #1 { ; SSE-LABEL: finite_f32_estimate: ; SSE: # BB#0: +; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: rsqrtss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: mulss %xmm1, %xmm2 @@ -111,6 +112,7 @@ define float @f32_no_estimate(float %x) #0 { ; SSE-LABEL: f32_no_estimate: ; SSE: # BB#0: +; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: sqrtss %xmm0, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: divss %xmm1, %xmm0 @@ -130,6 +132,7 @@ define float @f32_estimate(float %x) #1 { ; SSE-LABEL: f32_estimate: ; SSE: # BB#0: +; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: rsqrtss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: mulss %xmm2, %xmm2 @@ -263,4 +266,3 @@ attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" } attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" } attributes #2 = { nounwind readnone } - Index: test/CodeGen/X86/sse-scalar-fp-arith.ll =================================================================== --- test/CodeGen/X86/sse-scalar-fp-arith.ll +++ test/CodeGen/X86/sse-scalar-fp-arith.ll @@ -81,12 +81,14 @@ define <4 x float> @test_sqrt_ss(<4 x float> %a) { ; SSE2-LABEL: test_sqrt_ss: ; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: sqrtss %xmm0, %xmm1 ; SSE2-NEXT: movss 
{{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_sqrt_ss: ; SSE41: # BB#0: +; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: sqrtss %xmm0, %xmm1 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE41-NEXT: retq @@ -180,12 +182,14 @@ define <2 x double> @test_sqrt_sd(<2 x double> %a) { ; SSE2-LABEL: test_sqrt_sd: ; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: sqrtsd %xmm0, %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_sqrt_sd: ; SSE41: # BB#0: +; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: sqrtsd %xmm0, %xmm1 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE41-NEXT: retq Index: test/CodeGen/X86/sse2-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -1258,12 +1258,14 @@ ; X32-LABEL: test_mm_cvtsi32_sd: ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorps %xmm1, %xmm1 ; X32-NEXT: cvtsi2sdl %eax, %xmm1 ; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_cvtsi32_sd: ; X64: # BB#0: +; X64-NEXT: xorps %xmm1, %xmm1 ; X64-NEXT: cvtsi2sdl %edi, %xmm1 ; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X64-NEXT: retq @@ -3878,4 +3880,3 @@ } !0 = !{i32 1} - Index: test/CodeGen/X86/sse_partial_update.ll =================================================================== --- test/CodeGen/X86/sse_partial_update.ll +++ test/CodeGen/X86/sse_partial_update.ll @@ -12,8 +12,10 @@ ; CHECK-LABEL: rsqrtss: ; CHECK: ## BB#0: ## %entry ; CHECK-NEXT: rsqrtss %xmm0, %xmm0 +; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: cvtss2sd %xmm0, %xmm2 ; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: cvtss2sd %xmm0, %xmm1 ; CHECK-NEXT: movaps %xmm2, %xmm0 ; CHECK-NEXT: jmp _callee ## TAILCALL @@ -34,8 +36,10 @@ ; CHECK-LABEL: rcpss: ; CHECK: ## BB#0: ## %entry ; CHECK-NEXT: rcpss %xmm0, %xmm0 +; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: cvtss2sd %xmm0, %xmm2 ; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: cvtss2sd %xmm0, %xmm1 ; CHECK-NEXT: movaps %xmm2, %xmm0 ; CHECK-NEXT: jmp _callee ## TAILCALL @@ -55,8 +59,10 @@ ; CHECK-LABEL: sqrtss: ; CHECK: ## BB#0: ## %entry ; CHECK-NEXT: sqrtss %xmm0, %xmm0 +; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: cvtss2sd %xmm0, %xmm2 ; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: cvtss2sd %xmm0, %xmm1 ; CHECK-NEXT: movaps %xmm2, %xmm0 ; CHECK-NEXT: jmp _callee ## TAILCALL @@ -76,8 +82,10 @@ ; CHECK-LABEL: sqrtsd: ; CHECK: ## BB#0: ## %entry ; CHECK-NEXT: sqrtsd %xmm0, %xmm0 +; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: cvtsd2ss %xmm0, %xmm2 ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: cvtsd2ss %xmm0, %xmm1 ; CHECK-NEXT: movaps %xmm2, %xmm0 ; CHECK-NEXT: jmp _callee2 ## TAILCALL @@ -129,4 +137,3 @@ } declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone - Index: test/CodeGen/X86/uint64-to-float.ll =================================================================== --- test/CodeGen/X86/uint64-to-float.ll +++ test/CodeGen/X86/uint64-to-float.ll @@ -33,6 +33,7 @@ ; X64-NEXT: testq %rdi, %rdi ; X64-NEXT: js .LBB0_1 ; X64-NEXT: # BB#2: # %entry +; X64-NEXT: xorps %xmm0, %xmm0 ; X64-NEXT: cvtsi2ssq %rdi, %xmm0 ; X64-NEXT: retq ; X64-NEXT: 
.LBB0_1: @@ -40,6 +41,7 @@ ; X64-NEXT: shrq %rax ; X64-NEXT: andl $1, %edi ; X64-NEXT: orq %rax, %rdi +; X64-NEXT: xorps %xmm0, %xmm0 ; X64-NEXT: cvtsi2ssq %rdi, %xmm0 ; X64-NEXT: addss %xmm0, %xmm0 ; X64-NEXT: retq Index: test/CodeGen/X86/uint_to_fp.ll =================================================================== --- test/CodeGen/X86/uint_to_fp.ll +++ test/CodeGen/X86/uint_to_fp.ll @@ -9,6 +9,7 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: shrl $23, %ecx +; X32-NEXT: xorps %xmm0, %xmm0 ; X32-NEXT: cvtsi2ssl %ecx, %xmm0 ; X32-NEXT: movss %xmm0, (%eax) ; X32-NEXT: retl @@ -16,6 +17,7 @@ ; X64-LABEL: test: ; X64: ## BB#0: ## %entry ; X64-NEXT: shrl $23, %edi +; X64-NEXT: xorps %xmm0, %xmm0 ; X64-NEXT: cvtsi2ssl %edi, %xmm0 ; X64-NEXT: movss %xmm0, (%rsi) ; X64-NEXT: retq Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -20,6 +20,7 @@ ; SSE-LABEL: sitofp_2i64_to_2f64: ; SSE: # BB#0: ; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cvtsi2sdq %rax, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE-NEXT: movd %xmm0, %rax @@ -32,7 +33,8 @@ ; VEX-LABEL: sitofp_2i64_to_2f64: ; VEX: # BB#0: ; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 +; VEX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1 ; VEX-NEXT: vmovq %xmm0, %rax ; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 ; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -41,7 +43,8 @@ ; AVX512F-LABEL: sitofp_2i64_to_2f64: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -50,7 +53,8 @@ ; AVX512VL-LABEL: sitofp_2i64_to_2f64: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -221,6 +225,7 @@ ; SSE-LABEL: sitofp_4i64_to_4f64: ; SSE: # BB#0: ; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2sdq %rax, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE-NEXT: movd %xmm0, %rax @@ -228,6 +233,7 @@ ; SSE-NEXT: cvtsi2sdq %rax, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: xorps %xmm3, %xmm3 ; SSE-NEXT: cvtsi2sdq %rax, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE-NEXT: movd %xmm0, %rax @@ -242,7 +248,8 @@ ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -258,7 +265,8 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; 
AVX2-NEXT: vmovq %xmm1, %rax ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -274,7 +282,8 @@ ; AVX512F: # BB#0: ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax ; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -290,7 +299,8 @@ ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -478,7 +488,8 @@ ; AVX512F-LABEL: uitofp_2i64_to_2f64: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -487,7 +498,8 @@ ; AVX512VL-LABEL: uitofp_2i64_to_2f64: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -807,7 +819,8 @@ ; AVX512F: # BB#0: ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax ; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -823,7 +836,8 @@ ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -1042,6 +1056,7 @@ ; SSE-LABEL: sitofp_2i64_to_4f32: ; SSE: # BB#0: ; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE-NEXT: movd %xmm0, %rax @@ -1054,7 +1069,8 @@ ; VEX-LABEL: sitofp_2i64_to_4f32: ; VEX: # BB#0: ; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; VEX-NEXT: vmovq %xmm0, %rax ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1065,7 +1081,8 @@ ; AVX512F-LABEL: sitofp_2i64_to_4f32: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[2,3] @@ -1076,7 +1093,8 @@ ; AVX512VL-LABEL: sitofp_2i64_to_4f32: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1117,7 +1135,8 @@ ; VEX-LABEL: sitofp_2i64_to_4f32_zero: ; VEX: # BB#0: ; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; VEX-NEXT: vmovq %xmm0, %rax ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero @@ -1126,7 +1145,8 @@ ; AVX512F-LABEL: sitofp_2i64_to_4f32_zero: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero @@ -1135,7 +1155,8 @@ ; AVX512VL-LABEL: sitofp_2i64_to_4f32_zero: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1161,8 +1182,10 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; SSE-LABEL: sitofp_4i64_to_4f32_undef: ; SSE: # BB#0: +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] @@ -1177,7 +1200,8 @@ ; VEX-LABEL: sitofp_4i64_to_4f32_undef: ; VEX: # BB#0: ; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; VEX-NEXT: vmovq %xmm0, %rax ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1188,7 +1212,8 @@ ; AVX512F-LABEL: sitofp_4i64_to_4f32_undef: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1199,7 +1224,8 @@ ; AVX512VL-LABEL: sitofp_4i64_to_4f32_undef: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1356,8 +1382,10 @@ ; SSE-LABEL: sitofp_4i64_to_4f32: ; SSE: # BB#0: ; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: xorps %xmm3, %xmm3 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3 ; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: unpcklps 
{{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] @@ -1376,9 +1404,10 @@ ; AVX1-LABEL: sitofp_4i64_to_4f32: ; AVX1: # BB#0: ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -1393,9 +1422,10 @@ ; AVX2-LABEL: sitofp_4i64_to_4f32: ; AVX2: # BB#0: ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -1410,9 +1440,10 @@ ; AVX512F-LABEL: sitofp_4i64_to_4f32: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax @@ -1426,9 +1457,10 @@ ; AVX512VL-LABEL: sitofp_4i64_to_4f32: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -1636,6 +1668,7 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB39_1 ; VEX-NEXT: # BB#2: +; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: jmp .LBB39_3 ; VEX-NEXT: .LBB39_1: @@ -1643,6 +1676,7 @@ ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; VEX-NEXT: .LBB39_3: @@ -1650,14 +1684,16 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB39_4 ; VEX-NEXT: # BB#5: -; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; VEX-NEXT: jmp .LBB39_6 ; VEX-NEXT: .LBB39_4: ; VEX-NEXT: movq %rax, %rcx ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; VEX-NEXT: .LBB39_6: ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1665,7 +1701,7 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB39_8 ; VEX-NEXT: # BB#7: -; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: .LBB39_8: ; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; VEX-NEXT: retq @@ -1673,7 
+1709,8 @@ ; AVX512F-LABEL: uitofp_2i64_to_4f32: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1684,7 +1721,8 @@ ; AVX512VL-LABEL: uitofp_2i64_to_4f32: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1754,6 +1792,7 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB40_1 ; VEX-NEXT: # BB#2: +; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: jmp .LBB40_3 ; VEX-NEXT: .LBB40_1: @@ -1761,6 +1800,7 @@ ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; VEX-NEXT: .LBB40_3: @@ -1768,7 +1808,8 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB40_4 ; VEX-NEXT: # BB#5: -; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; VEX-NEXT: retq ; VEX-NEXT: .LBB40_4: @@ -1776,7 +1817,8 @@ ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; VEX-NEXT: retq @@ -1784,7 +1826,8 @@ ; AVX512F-LABEL: uitofp_2i64_to_2f32: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero @@ -1793,7 +1836,8 @@ ; AVX512VL-LABEL: uitofp_2i64_to_2f32: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1824,7 +1868,6 @@ ; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: js .LBB41_2 ; SSE-NEXT: # BB#1: -; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: .LBB41_2: ; SSE-NEXT: movd %xmm1, %rax @@ -1871,6 +1914,7 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB41_1 ; VEX-NEXT: # BB#2: +; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: jmp .LBB41_3 ; VEX-NEXT: .LBB41_1: @@ -1878,6 +1922,7 @@ ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; VEX-NEXT: .LBB41_3: @@ -1885,14 +1930,16 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB41_4 ; VEX-NEXT: # BB#5: -; 
VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; VEX-NEXT: jmp .LBB41_6 ; VEX-NEXT: .LBB41_4: ; VEX-NEXT: movq %rax, %rcx ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; VEX-NEXT: .LBB41_6: ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1900,7 +1947,7 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB41_8 ; VEX-NEXT: # BB#7: -; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: .LBB41_8: ; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; VEX-NEXT: retq @@ -1908,7 +1955,8 @@ ; AVX512F-LABEL: uitofp_4i64_to_4f32_undef: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1919,7 +1967,8 @@ ; AVX512VL-LABEL: uitofp_4i64_to_4f32_undef: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -2125,6 +2174,7 @@ ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB47_1 ; SSE-NEXT: # BB#2: +; SSE-NEXT: xorps %xmm3, %xmm3 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3 ; SSE-NEXT: jmp .LBB47_3 ; SSE-NEXT: .LBB47_1: @@ -2132,6 +2182,7 @@ ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: xorps %xmm3, %xmm3 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3 ; SSE-NEXT: addss %xmm3, %xmm3 ; SSE-NEXT: .LBB47_3: @@ -2139,6 +2190,7 @@ ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB47_4 ; SSE-NEXT: # BB#5: +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: jmp .LBB47_6 ; SSE-NEXT: .LBB47_4: @@ -2146,6 +2198,7 @@ ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: addss %xmm2, %xmm2 ; SSE-NEXT: .LBB47_6: @@ -2195,6 +2248,7 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB47_1 ; AVX1-NEXT: # BB#2: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: jmp .LBB47_3 ; AVX1-NEXT: .LBB47_1: @@ -2202,6 +2256,7 @@ ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .LBB47_3: @@ -2209,6 +2264,7 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB47_4 ; AVX1-NEXT: # BB#5: +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: jmp .LBB47_6 ; AVX1-NEXT: .LBB47_4: @@ -2216,6 +2272,7 @@ ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB47_6: @@ -2225,14 +2282,16 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB47_7 ; AVX1-NEXT: # BB#8: -; AVX1-NEXT: 
vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: jmp .LBB47_9 ; AVX1-NEXT: .LBB47_7: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB47_9: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -2240,7 +2299,8 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB47_10 ; AVX1-NEXT: # BB#11: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2249,7 +2309,8 @@ ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper @@ -2261,6 +2322,7 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB47_1 ; AVX2-NEXT: # BB#2: +; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: jmp .LBB47_3 ; AVX2-NEXT: .LBB47_1: @@ -2268,6 +2330,7 @@ ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: .LBB47_3: @@ -2275,6 +2338,7 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB47_4 ; AVX2-NEXT: # BB#5: +; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: jmp .LBB47_6 ; AVX2-NEXT: .LBB47_4: @@ -2282,6 +2346,7 @@ ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB47_6: @@ -2291,14 +2356,16 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB47_7 ; AVX2-NEXT: # BB#8: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: jmp .LBB47_9 ; AVX2-NEXT: .LBB47_7: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB47_9: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -2306,7 +2373,8 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB47_10 ; AVX2-NEXT: # BB#11: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2315,7 +2383,8 @@ ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-NEXT: vzeroupper @@ -2324,9 +2393,10 @@ ; AVX512F-LABEL: uitofp_4i64_to_4f32: ; 
AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax @@ -2340,9 +2410,10 @@ ; AVX512VL-LABEL: uitofp_4i64_to_4f32: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -2564,6 +2635,7 @@ ; SSE: # BB#0: ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE-NEXT: movd %xmm1, %rax @@ -2576,7 +2648,8 @@ ; VEX: # BB#0: ; VEX-NEXT: vmovdqa (%rdi), %xmm0 ; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 +; VEX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1 ; VEX-NEXT: vmovq %xmm0, %rax ; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 ; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -2586,7 +2659,8 @@ ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -2596,7 +2670,8 @@ ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -2705,6 +2780,7 @@ ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE-NEXT: movd %xmm1, %rax @@ -2726,7 +2802,8 @@ ; AVX1-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -2743,7 +2820,8 @@ ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; AVX2-NEXT: vmovq %xmm1, %rax ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -2760,7 +2838,8 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; 
AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax ; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -2777,7 +2856,8 @@ ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -2905,7 +2985,8 @@ ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -2915,7 +2996,8 @@ ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -3175,7 +3257,8 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax ; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -3192,7 +3275,8 @@ ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -3349,8 +3433,10 @@ ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movd %xmm2, %rax +; SSE-NEXT: xorps %xmm3, %xmm3 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3 ; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] @@ -3369,9 +3455,10 @@ ; AVX1: # BB#0: ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -3387,9 +3474,10 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; 
AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -3405,9 +3493,10 @@ ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax @@ -3422,9 +3511,10 @@ ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -3513,8 +3603,10 @@ ; SSE-NEXT: movdqa 32(%rdi), %xmm3 ; SSE-NEXT: movdqa 48(%rdi), %xmm4 ; SSE-NEXT: movd %xmm2, %rax +; SSE-NEXT: xorps %xmm5, %xmm5 ; SSE-NEXT: cvtsi2ssq %rax, %xmm5 ; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] @@ -3551,9 +3643,10 @@ ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 ; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rax @@ -3582,9 +3675,10 @@ ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX2-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 ; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vmovq %xmm1, %rax @@ -3613,13 +3707,14 @@ ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; AVX512F-NEXT: vmovq %xmm2, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; 
AVX512F-NEXT: vpextrq $1, %xmm2, %rax ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 @@ -3644,13 +3739,14 @@ ; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm1 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; AVX512VL-NEXT: vmovq %xmm2, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 @@ -3791,6 +3887,7 @@ ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB76_1 ; SSE-NEXT: # BB#2: +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: jmp .LBB76_3 ; SSE-NEXT: .LBB76_1: @@ -3798,6 +3895,7 @@ ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: addss %xmm2, %xmm2 ; SSE-NEXT: .LBB76_3: @@ -3805,6 +3903,7 @@ ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB76_4 ; SSE-NEXT: # BB#5: +; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: jmp .LBB76_6 ; SSE-NEXT: .LBB76_4: @@ -3812,6 +3911,7 @@ ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: addss %xmm0, %xmm0 ; SSE-NEXT: .LBB76_6: @@ -3861,6 +3961,7 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB76_1 ; AVX1-NEXT: # BB#2: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: jmp .LBB76_3 ; AVX1-NEXT: .LBB76_1: @@ -3868,6 +3969,7 @@ ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .LBB76_3: @@ -3875,6 +3977,7 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB76_4 ; AVX1-NEXT: # BB#5: +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: jmp .LBB76_6 ; AVX1-NEXT: .LBB76_4: @@ -3882,6 +3985,7 @@ ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB76_6: @@ -3891,14 +3995,16 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB76_7 ; AVX1-NEXT: # BB#8: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: jmp .LBB76_9 ; AVX1-NEXT: .LBB76_7: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB76_9: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -3906,7 +4012,8 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB76_10 ; AVX1-NEXT: # BB#11: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; 
AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -3915,7 +4022,8 @@ ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper @@ -3928,6 +4036,7 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB76_1 ; AVX2-NEXT: # BB#2: +; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: jmp .LBB76_3 ; AVX2-NEXT: .LBB76_1: @@ -3935,6 +4044,7 @@ ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: .LBB76_3: @@ -3942,6 +4052,7 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB76_4 ; AVX2-NEXT: # BB#5: +; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: jmp .LBB76_6 ; AVX2-NEXT: .LBB76_4: @@ -3949,6 +4060,7 @@ ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB76_6: @@ -3958,14 +4070,16 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB76_7 ; AVX2-NEXT: # BB#8: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: jmp .LBB76_9 ; AVX2-NEXT: .LBB76_7: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB76_9: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -3973,7 +4087,8 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB76_10 ; AVX2-NEXT: # BB#11: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3982,7 +4097,8 @@ ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-NEXT: vzeroupper @@ -3992,9 +4108,10 @@ ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax @@ -4009,9 +4126,10 @@ ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: 
vcvtusi2ssq %rax, %xmm3, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -4152,6 +4270,7 @@ ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_1 ; SSE-NEXT: # BB#2: +; SSE-NEXT: xorps %xmm4, %xmm4 ; SSE-NEXT: cvtsi2ssq %rax, %xmm4 ; SSE-NEXT: jmp .LBB80_3 ; SSE-NEXT: .LBB80_1: @@ -4159,6 +4278,7 @@ ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: xorps %xmm4, %xmm4 ; SSE-NEXT: cvtsi2ssq %rax, %xmm4 ; SSE-NEXT: addss %xmm4, %xmm4 ; SSE-NEXT: .LBB80_3: @@ -4166,6 +4286,7 @@ ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_4 ; SSE-NEXT: # BB#5: +; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: jmp .LBB80_6 ; SSE-NEXT: .LBB80_4: @@ -4173,6 +4294,7 @@ ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: addss %xmm0, %xmm0 ; SSE-NEXT: .LBB80_6: @@ -4181,6 +4303,7 @@ ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_7 ; SSE-NEXT: # BB#8: +; SSE-NEXT: xorps %xmm6, %xmm6 ; SSE-NEXT: cvtsi2ssq %rax, %xmm6 ; SSE-NEXT: jmp .LBB80_9 ; SSE-NEXT: .LBB80_7: @@ -4188,6 +4311,7 @@ ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: xorps %xmm6, %xmm6 ; SSE-NEXT: cvtsi2ssq %rax, %xmm6 ; SSE-NEXT: addss %xmm6, %xmm6 ; SSE-NEXT: .LBB80_9: @@ -4212,6 +4336,7 @@ ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_13 ; SSE-NEXT: # BB#14: +; SSE-NEXT: xorps %xmm7, %xmm7 ; SSE-NEXT: cvtsi2ssq %rax, %xmm7 ; SSE-NEXT: jmp .LBB80_15 ; SSE-NEXT: .LBB80_13: @@ -4219,6 +4344,7 @@ ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: xorps %xmm7, %xmm7 ; SSE-NEXT: cvtsi2ssq %rax, %xmm7 ; SSE-NEXT: addss %xmm7, %xmm7 ; SSE-NEXT: .LBB80_15: @@ -4288,6 +4414,7 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_1 ; AVX1-NEXT: # BB#2: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: jmp .LBB80_3 ; AVX1-NEXT: .LBB80_1: @@ -4295,6 +4422,7 @@ ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .LBB80_3: @@ -4302,6 +4430,7 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_4 ; AVX1-NEXT: # BB#5: +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX1-NEXT: jmp .LBB80_6 ; AVX1-NEXT: .LBB80_4: @@ -4309,6 +4438,7 @@ ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: .LBB80_6: @@ -4317,6 +4447,7 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_7 ; AVX1-NEXT: # BB#8: +; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 ; AVX1-NEXT: jmp .LBB80_9 ; AVX1-NEXT: .LBB80_7: @@ -4324,6 +4455,7 @@ ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 ; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: .LBB80_9: @@ -4331,20 +4463,23 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_10 ; AVX1-NEXT: # BB#11: 
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: jmp .LBB80_12 ; AVX1-NEXT: .LBB80_10: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB80_12: ; AVX1-NEXT: vpextrq $1, %xmm0, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_13 ; AVX1-NEXT: # BB#14: +; AVX1-NEXT: vxorps %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX1-NEXT: jmp .LBB80_15 ; AVX1-NEXT: .LBB80_13: @@ -4352,6 +4487,7 @@ ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vxorps %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: .LBB80_15: @@ -4360,14 +4496,16 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_16 ; AVX1-NEXT: # BB#17: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX1-NEXT: jmp .LBB80_18 ; AVX1-NEXT: .LBB80_16: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: .LBB80_18: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] @@ -4377,14 +4515,16 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_19 ; AVX1-NEXT: # BB#20: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5 ; AVX1-NEXT: jmp .LBB80_21 ; AVX1-NEXT: .LBB80_19: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5 ; AVX1-NEXT: .LBB80_21: ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] @@ -4393,14 +4533,16 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_22 ; AVX1-NEXT: # BB#23: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: jmp .LBB80_24 ; AVX1-NEXT: .LBB80_22: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB80_24: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] @@ -4415,6 +4557,7 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_1 ; AVX2-NEXT: # BB#2: +; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: jmp .LBB80_3 ; AVX2-NEXT: .LBB80_1: @@ -4422,6 +4565,7 @@ ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: .LBB80_3: @@ -4429,6 +4573,7 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_4 ; AVX2-NEXT: # BB#5: +; AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vcvtsi2ssq %rax, 
%xmm3, %xmm3 ; AVX2-NEXT: jmp .LBB80_6 ; AVX2-NEXT: .LBB80_4: @@ -4436,6 +4581,7 @@ ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: .LBB80_6: @@ -4444,6 +4590,7 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_7 ; AVX2-NEXT: # BB#8: +; AVX2-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 ; AVX2-NEXT: jmp .LBB80_9 ; AVX2-NEXT: .LBB80_7: @@ -4451,6 +4598,7 @@ ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 ; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: .LBB80_9: @@ -4458,20 +4606,23 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_10 ; AVX2-NEXT: # BB#11: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 +; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: jmp .LBB80_12 ; AVX2-NEXT: .LBB80_10: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 +; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB80_12: ; AVX2-NEXT: vpextrq $1, %xmm0, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_13 ; AVX2-NEXT: # BB#14: +; AVX2-NEXT: vxorps %xmm5, %xmm5, %xmm5 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX2-NEXT: jmp .LBB80_15 ; AVX2-NEXT: .LBB80_13: @@ -4479,6 +4630,7 @@ ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vxorps %xmm5, %xmm5, %xmm5 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5 ; AVX2-NEXT: .LBB80_15: @@ -4487,14 +4639,16 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_16 ; AVX2-NEXT: # BB#17: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX2-NEXT: jmp .LBB80_18 ; AVX2-NEXT: .LBB80_16: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: .LBB80_18: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] @@ -4504,14 +4658,16 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_19 ; AVX2-NEXT: # BB#20: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5 ; AVX2-NEXT: jmp .LBB80_21 ; AVX2-NEXT: .LBB80_19: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5 ; AVX2-NEXT: .LBB80_21: ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] @@ -4520,14 +4676,16 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_22 ; AVX2-NEXT: # BB#23: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 +; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: jmp .LBB80_24 ; AVX2-NEXT: .LBB80_22: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; 
AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 +; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB80_24: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] @@ -4539,13 +4697,14 @@ ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; AVX512F-NEXT: vmovq %xmm2, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX512F-NEXT: vpextrq $1, %xmm2, %rax ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 @@ -4570,13 +4729,14 @@ ; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm1 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; AVX512VL-NEXT: vmovq %xmm2, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3 +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 Index: test/CodeGen/X86/vector-sqrt.ll =================================================================== --- test/CodeGen/X86/vector-sqrt.ll +++ test/CodeGen/X86/vector-sqrt.ll @@ -4,7 +4,7 @@ ; Function Attrs: nounwind readonly uwtable define <2 x double> @sqrtd2(double* nocapture readonly %v) local_unnamed_addr #0 { ; CHECK-LABEL: sqrtd2: -; CHECK: vsqrtsd (%rdi), %xmm0, %xmm0 +; CHECK: vsqrtsd (%rdi), %xmm1, %xmm0 ; CHECK-NEXT: vsqrtsd 8(%rdi), %xmm1, %xmm1 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq @@ -25,9 +25,9 @@ ; Function Attrs: nounwind readonly uwtable define <4 x float> @sqrtf4(float* nocapture readonly %v) local_unnamed_addr #0 { ; CHECK-LABEL: sqrtf4: -; CHECK: vsqrtss (%rdi), %xmm0, %xmm0 -; CHECK-NEXT: vsqrtss 4(%rdi), %xmm1, %xmm1 -; CHECK-NEXT: vsqrtss 8(%rdi), %xmm2, %xmm2 +; CHECK: vsqrtss (%rdi), %xmm3, %xmm0 +; CHECK-NEXT: vsqrtss 4(%rdi), %xmm3, %xmm1 +; CHECK-NEXT: vsqrtss 8(%rdi), %xmm3, %xmm2 ; CHECK-NEXT: vsqrtss 12(%rdi), %xmm3, %xmm3 ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
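Note (illustrative, not part of the patch): the test updates above all follow one pattern. (v)cvtsi2ss/sd, (v)cvtusi2ss/sd, and (v)sqrtss/sd only write the low element of their XMM destination and leave the upper bits unchanged, so without a recent definition of that register the instruction carries a false dependency on whatever last wrote it. The regenerated checks therefore expect either a zeroing idiom (xorps/vxorps of the destination) immediately before the conversion, or the merge operand retargeted to a register that was already zeroed or recently defined. A minimal reproducer, assuming an AVX-capable x86-64 target; the function name @cvt_i64_to_f32 and the RUN line are illustrative and not taken from the test files:

; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s
; sitofp of a scalar i64 lowers to vcvtsi2ssq, which merges its result into
; the low 32 bits of the destination XMM register; with this change the
; backend may emit a vxorps of that register first (or pick a register with
; enough clearance) to break the false dependency.
define float @cvt_i64_to_f32(i64 %x) {
  %f = sitofp i64 %x to float
  ret float %f
}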