Index: include/llvm/Target/TargetInstrInfo.h =================================================================== --- include/llvm/Target/TargetInstrInfo.h +++ include/llvm/Target/TargetInstrInfo.h @@ -1440,6 +1440,17 @@ virtual void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {} + /// May return true if the instruction in question is a dependency breaking + /// instruction. If so, the register number for which it is dependency + /// breaking should be returned in `OutReg`. It is preferable to return + /// false if the result cannot be determined. This would at worst result + /// in the insertion of an unnecessary instruction, while the other + /// alternative could result in significant false-dependency penalties. + virtual bool isDependencyBreak(MachineInstr &MI, + unsigned *OutReg = nullptr) const { + return false; + } + /// Create machine specific model for scheduling. virtual DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const { Index: lib/CodeGen/ExecutionDepsFix.cpp =================================================================== --- lib/CodeGen/ExecutionDepsFix.cpp +++ lib/CodeGen/ExecutionDepsFix.cpp @@ -222,9 +222,14 @@ void processDefs(MachineInstr *, bool breakDependency, bool Kill); void visitSoftInstr(MachineInstr*, unsigned mask); void visitHardInstr(MachineInstr*, unsigned domain); - void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, - unsigned Pref); + void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, unsigned Pref, + bool &TrueDependency); bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref); + + // Undef Reads + void collapseUndefReads(unsigned from, unsigned to, unsigned Reg); + unsigned updateChooseableRegs(SparseSet<unsigned> &, + const TargetRegisterClass *, bool); void processUndefReads(MachineBasicBlock*); }; } @@ -485,7 +490,6 @@ else visitHardInstr(MI, DomP.first); } - return !DomP.first; } @@ -493,7 +497,7 @@ /// machine 
instructions' undef operand to use a register that the instruction /// is truly dependent on, or use a register with clearance higher than Pref. void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, - unsigned Pref) { + unsigned Pref, bool &TrueDependency) { MachineOperand &MO = MI->getOperand(OpIdx); assert(MO.isUndef() && "Expected undef machine operand"); @@ -516,6 +520,7 @@ // We found a true dependency - replace the undef register with the true // dependency. MO.setReg(CurrMO.getReg()); + TrueDependency = true; return; } @@ -575,9 +580,14 @@ if (breakDependency) { unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); if (Pref) { - pickBestRegisterForUndef(MI, OpNum, Pref); - if (shouldBreakDependence(MI, OpNum, Pref)) + bool TrueDependency = false; + pickBestRegisterForUndef(MI, OpNum, Pref, TrueDependency); + // Don't bother adding true dependencies to UndefReads. All we'd find out + // is that the register is live (since this very instruction depends on + // it), so we can't do anything. + if (!TrueDependency && shouldBreakDependence(MI, OpNum, Pref)) { UndefReads.push_back(std::make_pair(MI, OpNum)); + } } } const MCInstrDesc &MCID = MI->getDesc(); @@ -610,9 +620,52 @@ kill(rx); } } + unsigned DepReg = 0; + if (TII->isDependencyBreak(*MI, &DepReg)) { + for (int rx : regIndices(DepReg)) { + // This instruction is a dependency break, so there are no clearance + // issues, reset the counter. + LiveRegs[rx].Def = -(1 << 20); + } + } ++CurInstr; } +// Set the undef read register to `Reg` for all UndefReads in the range +// [from,to). 
+void ExeDepsFix::collapseUndefReads(unsigned from, unsigned to, unsigned Reg) { + if (from >= to) + return; + for (unsigned i = from; i < to; ++i) { + MachineInstr *MI = std::get<0>(UndefReads[i]); + unsigned OpIdx = std::get<1>(UndefReads[i]); + MachineOperand &MO = MI->getOperand(OpIdx); + MO.setReg(Reg); + } + TII->breakPartialRegDependency(*std::get<0>(UndefReads[from]), + std::get<1>(UndefReads[from]), TRI); +} + +unsigned ExeDepsFix::updateChooseableRegs(SparseSet<unsigned> &ChoosableRegs, + const TargetRegisterClass *OpRC, + bool add) { + unsigned LowestValid = (unsigned)-1; + ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(OpRC); + for (auto Reg : Order) { + if (LiveRegSet.contains(Reg)) + ChoosableRegs.erase(Reg); + else if (add) { + ChoosableRegs.insert(Reg); + if (LowestValid == (unsigned)-1) + LowestValid = Reg; + } else if (ChoosableRegs.count(Reg) == 1) { + if (LowestValid == (unsigned)-1) + LowestValid = Reg; + } + } + return LowestValid; +} + /// \break Break false dependencies on undefined register reads. /// /// Walk the block backward computing precise liveness. This is expensive, so we @@ -623,31 +676,87 @@ if (UndefReads.empty()) return; + // We want to be slightly clever here, to avoid the following common pattern: + // Suppose we have some instruction `vrandom %in, %out` and the following code + // vrandom %xmm0, %xmm0 + // vrandom %xmm1, %xmm1 + // vrandom %xmm2, %xmm2 + // vrandom %xmm3, %xmm3 + // The earlier logic likes to produce these, because it picks the first + // register + // to break ties in clearance. However, most register allocators pick the dest + // register the same way. Naively, we'd have to insert a dependency break, + // before every instruction above. However, what we really want is + // vxorps %xmm3, %xmm3, %xmm3 + // vrandom %xmm3, %xmm0 + // vrandom %xmm3, %xmm1 + // vrandom %xmm3, %xmm2 + // vrandom %xmm3, %xmm3 + // To do so, we walk backwards and cumulatively keep track of which registers + // we can use to break the dependency. 
Then, once the set has collapsed, we + // reset the undef read register for all following instructions. + + // Collect this block's live out register units. LiveRegSet.init(*TRI); // We do not need to care about pristine registers as they are just preserved // but not actually used in the function. LiveRegSet.addLiveOutsNoPristines(*MBB); - MachineInstr *UndefMI = UndefReads.back().first; - unsigned OpIdx = UndefReads.back().second; + SparseSet<unsigned> ChoosableRegs; + ChoosableRegs.setUniverse(TRI->getNumRegs()); + + unsigned LastValid = (unsigned)-1; + const TargetRegisterClass *LastOpRC = nullptr; + size_t i, LastInit; + i = LastInit = UndefReads.size() - 1; + MachineInstr *UndefMI = std::get<0>(UndefReads[i]); for (MachineInstr &I : make_range(MBB->rbegin(), MBB->rend())) { // Update liveness, including the current instruction's defs. LiveRegSet.stepBackward(I); + // This ensures that we don't accidentally pick a register whose live region + // lies entirely between two undef reads (since that would defeat the + // purpose of breaking the dependency). 
+ for (auto LiveReg : LiveRegSet) + ChoosableRegs.erase(LiveReg); + if (UndefMI == &I) { - if (!LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg())) - TII->breakPartialRegDependency(*UndefMI, OpIdx, TRI); + unsigned OpIdx = std::get<1>(UndefReads[i]); + // Get the undef operand's register class + const TargetRegisterClass *OpRC = + TII->getRegClass(UndefMI->getDesc(), OpIdx, TRI, *MF); + if (OpRC != LastOpRC || ChoosableRegs.size() == 0) { + if (LastInit != i) { + if (LastValid != (unsigned)-1) + collapseUndefReads(i + 1, LastInit + 1, LastValid); + ChoosableRegs.clear(); + LastInit = i; + } + } + + unsigned LowestValid = + updateChooseableRegs(ChoosableRegs, OpRC, LastInit == i); + + if (ChoosableRegs.size() == 0) { + if (LastInit != i) { + if (LastValid != (unsigned)-1) + collapseUndefReads(i + 1, LastInit + 1, LastValid); + LowestValid = updateChooseableRegs(ChoosableRegs, OpRC, true); + LastInit = i; + } + } + LastValid = LowestValid; + LastOpRC = OpRC; - UndefReads.pop_back(); - if (UndefReads.empty()) - return; + if (i == 0) + break; - UndefMI = UndefReads.back().first; - OpIdx = UndefReads.back().second; + UndefMI = std::get<0>(UndefReads[--i]); } } + if (LastValid != (unsigned)-1) + collapseUndefReads(0, LastInit + 1, LastValid); } // A hard instruction only works in one domain. 
All input registers will be Index: lib/Target/X86/X86InstrInfo.h =================================================================== --- lib/Target/X86/X86InstrInfo.h +++ lib/Target/X86/X86InstrInfo.h @@ -484,6 +484,7 @@ const TargetRegisterInfo *TRI) const override; void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override; + bool isDependencyBreak(MachineInstr &MI, unsigned *OutReg) const override; MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, unsigned OpNum, Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -7496,6 +7496,23 @@ } } +bool X86InstrInfo::isDependencyBreak(MachineInstr &MI, unsigned *OutReg) const { + unsigned Opc = MI.getOpcode(); + if (!(Opc == X86::VXORPSrr || Opc == X86::VXORPDrr || Opc == X86::XORPSrr || + Opc == X86::XORPDrr)) + return false; + unsigned Reg = 0; + for (unsigned i = 0; i < MI.getNumOperands(); ++i) { + const MachineOperand &MO = MI.getOperand(i); + if (!MO.isReg() || (Reg != 0 && MO.getReg() != Reg)) + return false; + Reg = MO.getReg(); + } + if (OutReg) + *OutReg = Reg; + return true; +} + MachineInstr * X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, Index: test/CodeGen/X86/break-false-dep.ll =================================================================== --- test/CodeGen/X86/break-false-dep.ll +++ test/CodeGen/X86/break-false-dep.ll @@ -231,20 +231,29 @@ ;AVX: vcvtss2sd [[XMM0:%xmm[0-9]+]], [[XMM0]], {{%xmm[0-9]+}} } -; Make sure we are making a smart choice regarding undef registers and -; choosing the register with the highest clearence -define double @clearence(i64 %arg) { +define double @clearence(double %x, i64 %arg) { top: - tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"() - tail call void asm sideeffect "", 
"~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() - tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm7},~{dirflag},~{fpsr},~{flags}"() +;AVX-LABEL:@clearence +; This is carefully constructed to force LLVM to materialize a vxorps, which +; also implicitly breaks the dependency, making it a good candidate for the +; undef read below +;AVX: vxorps [[XMM1:%xmm1]], [[XMM1]], [[XMM1]] +;AVX: vucomisd [[XMM1]], %xmm0 + %0 = fcmp ult double %x, 0.0 + br i1 %0, label %main, label %fake + +main: + tail call void asm sideeffect "", "~{xmm0},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() %tmp1 = sitofp i64 %arg to double ret double %tmp1 -;AVX-LABEL:@clearence -;AVX: vxorps [[XMM6:%xmm6]], [[XMM6]], [[XMM6]] -;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM6]], {{%xmm[0-9]+}} +; Check that we re-use the dependency break from above +;AVX-NOT: vxorps +;AVX: vcvtsi2sdq {{.*}}, [[XMM1]], {{%xmm[0-9]+}} +fake: + ret double 0.0 } ; Make sure we are making a smart choice regarding undef registers in order to @@ -334,3 +343,25 @@ loopdone: ret void } + +define double @breakoptimization(i64 %a, i64 %b, i64 %c, i64 %d) { +;AVX-LABEL:@breakoptimization +top: + tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() +;AVX: vxorps [[XMM:%xmm[0-9]+]], [[XMM]], [[XMM]] +;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM]], {{%xmm[0-9]+}} +;AVX-NEXT: vcvtsi2sdq 
{{.*}}, [[XMM]], {{%xmm[0-9]+}} +;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM]], {{%xmm[0-9]+}} +;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM]], {{%xmm[0-9]+}} + %af = sitofp i64 %a to double + %bf = sitofp i64 %b to double + %cf = sitofp i64 %c to double + %df = sitofp i64 %d to double + %fadd1 = fadd double %af, %bf + %fadd2 = fadd double %cf, %df + %fadd3 = fadd double %fadd1, %fadd2 + ret double %fadd3 +} Index: test/CodeGen/X86/known-bits-vector.ll =================================================================== --- test/CodeGen/X86/known-bits-vector.ll +++ test/CodeGen/X86/known-bits-vector.ll @@ -42,7 +42,7 @@ ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] ; X64-NEXT: vmovq %xmm0, %rax -; X64-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; X64-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm0 ; X64-NEXT: retq %1 = and <2 x i64> %a0, <i64 65535, i64 -1> %2 = extractelement <2 x i64> %1, i32 0 Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -1665,7 +1665,7 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB39_8 ; VEX-NEXT: # BB#7: -; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: .LBB39_8: ; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; VEX-NEXT: retq @@ -1824,7 +1824,6 @@ ; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: js .LBB41_2 ; SSE-NEXT: # BB#1: -; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: .LBB41_2: ; SSE-NEXT: movd %xmm1, %rax @@ -1900,7 +1899,7 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB41_8 ; VEX-NEXT: # BB#7: -; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: .LBB41_8: ; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; VEX-NEXT: retq