Index: lib/CodeGen/ExecutionDepsFix.cpp =================================================================== --- lib/CodeGen/ExecutionDepsFix.cpp +++ lib/CodeGen/ExecutionDepsFix.cpp @@ -164,7 +164,8 @@ MBBInfoMap MBBInfos; /// List of undefined register reads in this block in forward order. - std::vector > UndefReads; + std::vector> + UndefReads; /// Storage for register unit liveness. LivePhysRegs LiveRegSet; @@ -214,12 +215,12 @@ bool isBlockDone(MachineBasicBlock *); void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass, bool Done); void updateSuccessors(MachineBasicBlock *MBB, bool Primary, bool Done); - bool visitInstr(MachineInstr *); + bool visitInstr(MachineInstr *, bool PrimaryPass); void processDefs(MachineInstr *, bool BlockDone, bool Kill); void visitSoftInstr(MachineInstr*, unsigned mask); void visitHardInstr(MachineInstr*, unsigned domain); - void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, - unsigned Pref); + unsigned pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, + unsigned Pref, bool &TrueDependency); bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref); void processUndefReads(MachineBasicBlock*); }; @@ -470,24 +471,37 @@ LiveRegs = nullptr; } -bool ExeDepsFix::visitInstr(MachineInstr *MI) { - // Update instructions with explicit execution domains. - std::pair DomP = TII->getExecutionDomain(*MI); - if (DomP.first) { - if (DomP.second) - visitSoftInstr(MI, DomP.second); - else - visitHardInstr(MI, DomP.first); +bool ExeDepsFix::visitInstr(MachineInstr *MI, bool PrimaryPass) { + bool Kill = false; + + if (PrimaryPass) { + // Update instructions with explicit execution domains. + std::pair DomP = TII->getExecutionDomain(*MI); + if (DomP.first) { + if (DomP.second) + visitSoftInstr(MI, DomP.second); + else + visitHardInstr(MI, DomP.first); + } + Kill = !DomP.first; + } + + // If this is a call, pretend all registers we are considering are def'd here. + // We have no idea which registers the callee may use. + if (MI->isCall()) { + for (unsigned i = 0, e = NumRegs; i != e; ++i) + LiveRegs[i].Def = CurInstr; } - return !DomP.first; + return Kill; } /// \brief Helps avoid false dependencies on undef registers by updating the /// machine instructions' undef operand to use a register that the instruction /// is truly dependent on, or use a register with clearance higher than Pref. -void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, - unsigned Pref) { +unsigned ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, + unsigned Pref, + bool &TrueDependency) { MachineOperand &MO = MI->getOperand(OpIdx); assert(MO.isUndef() && "Expected undef machine operand"); @@ -495,7 +509,7 @@ // Update only undef operands that are mapped to one register. if (AliasMap[OriginalReg].size() != 1) - return; + return (unsigned)-1; // Get the undef operand's register class const TargetRegisterClass *OpRC = @@ -510,7 +524,8 @@ // We found a true dependency - replace the undef register with the true // dependency. MO.setReg(CurrMO.getReg()); - return; + TrueDependency = true; + return (unsigned)-1; } // Go over all registers in the register class and find the register with @@ -535,6 +550,8 @@ // Update the operand if we found a register with better clearance. if (MaxClearanceReg != OriginalReg) MO.setReg(MaxClearanceReg); + + return MaxClearance; } /// \brief Return true to if it makes sense to break dependence on a partial def @@ -571,9 +588,16 @@ if (BlockDone) { unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); if (Pref) { - pickBestRegisterForUndef(MI, OpNum, Pref); - if (shouldBreakDependence(MI, OpNum, Pref)) - UndefReads.push_back(std::make_pair(MI, OpNum)); + bool TrueDependency = false; + unsigned MaxClearance = + pickBestRegisterForUndef(MI, OpNum, Pref, TrueDependency); + // Don't bother adding true dependencies to UndefReads. All we'd find out + // is that the register is live (since this very instruction depends on + // it), so we can't do anything. + if (!TrueDependency && shouldBreakDependence(MI, OpNum, Pref)) { + UndefReads.push_back( + std::make_tuple(MI, OpNum, MaxClearance, CurInstr)); + } } } const MCInstrDesc &MCID = MI->getDesc(); @@ -619,31 +643,70 @@ if (UndefReads.empty()) return; + // We will make two passes through the list of undef regs. The first + // (backward) to determine legality of inserting the dependency breaking + // instruction, the second (forward) to do the actual insertion. This allows + // us to avoid inserting the dependency breaking instruction twice, if an + // earlier such instruction also breaks the dependency for the latter (e.g. + // because there's multiple undef reads from the same register in close + // succession). + + //// First pass // Collect this block's live out register units. LiveRegSet.init(*TRI); // We do not need to care about pristine registers as they are just preserved // but not actually used in the function. LiveRegSet.addLiveOutsNoPristines(*MBB); - MachineInstr *UndefMI = UndefReads.back().first; - unsigned OpIdx = UndefReads.back().second; + size_t i = UndefReads.size(); + MachineInstr *UndefMI = std::get<0>(UndefReads[i]); for (MachineInstr &I : make_range(MBB->rbegin(), MBB->rend())) { // Update liveness, including the current instruction's defs. LiveRegSet.stepBackward(I); if (UndefMI == &I) { - if (!LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg())) - TII->breakPartialRegDependency(*UndefMI, OpIdx, TRI); + unsigned OpIdx = std::get<1>(UndefReads[i]); + if (LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg())) { + // This is illegal. Null out the Instruction, so we now to skip this + // on the next pass + std::get<0>(UndefReads[i]) = nullptr; + } - UndefReads.pop_back(); - if (UndefReads.empty()) - return; + if (i == 0) + break; + UndefMI = std::get<0>(UndefReads[--i]); + } + } - UndefMI = UndefReads.back().first; - OpIdx = UndefReads.back().second; + //// Second pass + // Keep track of the instruction number at which we broke the instruction. + // If the clearance for a later instruction is larger than the + // difference between that instruction and where we broke the dependence, + // then the first insertion also breaks the dependence for the second + // instruction. + unsigned *BrokeDependenceAt = new unsigned[NumRegs]; + memset(BrokeDependenceAt, (unsigned)-1, sizeof(unsigned) * NumRegs); + for (auto ToBreak : UndefReads) { + MachineInstr *MI = std::get<0>(ToBreak); + unsigned OpIdx = std::get<1>(ToBreak); + unsigned Clearance = std::get<2>(ToBreak); + unsigned Here = std::get<3>(ToBreak); + if (!MI) + continue; + for (int rx : regIndices(MI->getOperand(OpIdx).getReg())) { + if (BrokeDependenceAt[rx] != (unsigned)-1 && Clearance != (unsigned)-1) { + unsigned There = BrokeDependenceAt[rx]; + if (Clearance > Here - There) + continue; + } + TII->breakPartialRegDependency(*MI, OpIdx, TRI); + BrokeDependenceAt[rx] = Here; } } + delete[] BrokeDependenceAt; + + UndefReads.clear(); } // A hard instruction only works in one domain. All input registers will be @@ -793,9 +856,7 @@ enterBasicBlock(MBB); for (MachineInstr &MI : *MBB) { if (!MI.isDebugValue()) { - bool Kill = false; - if (PrimaryPass) - Kill = visitInstr(&MI); + bool Kill = visitInstr(&MI, PrimaryPass); processDefs(&MI, isBlockDone(MBB), Kill); } } Index: test/CodeGen/X86/break-false-dep.ll =================================================================== --- test/CodeGen/X86/break-false-dep.ll +++ test/CodeGen/X86/break-false-dep.ll @@ -310,6 +310,7 @@ loop_end: %nexti = add i64 %phi_i, 1 %nextj = add i64 %phi_j, 1 +;AVX-LABEL:@loopclearance2 ; Register use, plus us clobbering 7-15 above, basically forces xmm7 here as ; the only reasonable choice. The primary thing we care about is that it's ; not one of the registers used in the loop (e.g. not the output reg here) @@ -334,3 +335,35 @@ loopdone: ret void } + +; Make sure that calls kill register clearance and that a we don't insert +; an extra dependency-breaking instruction if one suffices. +declare double @sin(double %x) +define void @callclearance(double *%x, i64 *%y, i64 *%z) { +entry: + br label %loop + +loop: + %idx = phi i32 [0, %entry], [%idx, %loop] + %valptr = getelementptr i64, i64* %y, i32 %idx + %valptr2 = getelementptr i64, i64* %z, i32 %idx + %outptr = getelementptr double, double* %x, i32 %idx +;AVX-LABEL:@callclearance +;AVX: vxorps [[THEXMM:%xmm[0-9]+]], [[THEXMM]], [[THEXMM]] +;AVX: vcvtsi2sdq {{.*}}, [[THEXMM]], {{%xmm[0-9]+}} +;AVX-NOT: vxorps +;AVX: vcvtsi2sdq {{.*}}, [[THEXMM]], {{%xmm[0-9]+}} + %val = load i64, i64 *%valptr + %val_f = sitofp i64 %val to double + %val2 = load i64, i64 *%valptr2 + %val2_f = sitofp i64 %val2 to double + %sined = call double @sin(double %val_f) + %sined2 = call double @sin(double %val2_f) + %sum = fadd double %sined, %sined2 + store double %sum, double *%x + %done = icmp sgt i32 %idx, 10000 + br i1 %done, label %end, label %loop + +end: + ret void +} Index: test/CodeGen/X86/half.ll =================================================================== --- test/CodeGen/X86/half.ll +++ test/CodeGen/X86/half.ll @@ -287,6 +287,7 @@ ; CHECK-LIBCALL-NEXT: movzwl (%rsi), %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee ; CHECK-LIBCALL-NEXT: movss %xmm0, 12(%rsp) +; CHECK-LIBCALL-NEXT: xorps %xmm0, %xmm0 ; CHECK-LIBCALL-NEXT: cvtsi2ssl %ebx, %xmm0 ; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee ; CHECK-LIBCALL-NEXT: movzwl %ax, %edi