Index: lib/Target/X86/X86FixupBWInsts.cpp =================================================================== --- lib/Target/X86/X86FixupBWInsts.cpp +++ lib/Target/X86/X86FixupBWInsts.cpp @@ -95,6 +95,12 @@ /// nullptr. MachineInstr *tryReplaceCopy(MachineInstr *MI) const; + // Change the MachineInstr \p MI into an equivalent 32 bit instruction if + // possible. Return the replacement instruction if OK, return nullptr + // otherwise. Set WasCandidate to true or false depending on whether the + // MI was a candidate for this sort of transformation. + MachineInstr *tryReplaceInstr(MachineInstr *MI, MachineBasicBlock &MBB, + bool &WasCandidate) const; public: static char ID; @@ -267,6 +273,54 @@ return MIB; } +MachineInstr *FixupBWInstPass::tryReplaceInstr( + MachineInstr *MI, MachineBasicBlock &MBB, + bool &WasCandidate) const { + MachineInstr *NewMI = nullptr; + WasCandidate = false; + + // See if this is an instruction of the type we are currently looking for. + switch (MI->getOpcode()) { + + case X86::MOV8rm: + // Only replace 8 bit loads with the zero extending versions if + // in an inner most loop and not optimizing for size. This takes + // an extra byte to encode, and provides limited performance upside. + if (MachineLoop *ML = MLI->getLoopFor(&MBB)) { + if (ML->begin() == ML->end() && !OptForSize) { + NewMI = tryReplaceLoad(X86::MOVZX32rm8, MI); + WasCandidate = true; + } + } + break; + + case X86::MOV16rm: + // Always try to replace 16 bit load with 32 bit zero extending. + // Code size is the same, and there is sometimes a perf advantage + // from eliminating a false dependence on the upper portion of + // the register. + NewMI = tryReplaceLoad(X86::MOVZX32rm16, MI); + WasCandidate = true; + break; + + case X86::MOV8rr: + case X86::MOV16rr: + // Always try to replace 8/16 bit copies with a 32 bit copy. 
+ // Code size is either less (16) or equal (8), and there is sometimes a + // perf advantage from eliminating a false dependence on the upper portion + // of the register. + NewMI = tryReplaceCopy(MI); + WasCandidate = true; + break; + + default: + // nothing to do here. + break; + } + + return NewMI; +} + void FixupBWInstPass::processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB) { @@ -280,7 +334,9 @@ // and notes that and the original in a data structure, until the // whole BB has been analyzed. This keeps the replacement instructions // from making it seem as if the larger register might be live. - SmallVector<std::pair<MachineInstr *, MachineInstr *>, 8> MIReplacements; + typedef SmallVector<std::pair<MachineInstr *, MachineInstr *>, 8> + MIReplacementsType; + MIReplacementsType MIReplacements; // Start computing liveness for this block. We iterate from the end to be able // to update this for each instruction. @@ -288,57 +344,59 @@ // We run after PEI, so we need to AddPristinesAndCSRs. LiveRegs.addLiveOuts(MBB); + bool CandidateDidntGetTransformed = false; + bool WasCandidate = false; + for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) { - MachineInstr *NewMI = nullptr; MachineInstr *MI = &*I; + + MachineInstr *NewMI = tryReplaceInstr(MI, MBB, WasCandidate); - // See if this is an instruction of the type we are currently looking for. - switch (MI->getOpcode()) { - - case X86::MOV8rm: - // Only replace 8 bit loads with the zero extending versions if - // in an inner most loop and not optimizing for size. This takes - // an extra byte to encode, and provides limited performance upside. - if (MachineLoop *ML = MLI->getLoopFor(&MBB)) { - if (ML->begin() == ML->end() && !OptForSize) - NewMI = tryReplaceLoad(X86::MOVZX32rm8, MI); - } - break; - - case X86::MOV16rm: - // Always try to replace 16 bit load with 32 bit zero extending. - // Code size is the same, and there is sometimes a perf advantage - // from eliminating a false dependence on the upper portion of - // the register. 
- NewMI = tryReplaceLoad(X86::MOVZX32rm16, MI); - break; - - case X86::MOV8rr: - case X86::MOV16rr: - // Always try to replace 8/16 bit copies with a 32 bit copy. - // Code size is either less (16) or equal (8), and there is sometimes a - // perf advantage from eliminating a false dependence on the upper portion - // of the register. - NewMI = tryReplaceCopy(MI); - break; - - default: - // nothing to do here. - break; - } - - if (NewMI) + // Add this to replacements if it was a candidate, even if NewMI is + // nullptr. We will revisit that in a bit. + if (WasCandidate) { MIReplacements.push_back(std::make_pair(MI, NewMI)); + if (!NewMI) + CandidateDidntGetTransformed = true; + } // We're done with this instruction, update liveness for the next one. LiveRegs.stepBackward(*MI); } + if (CandidateDidntGetTransformed) { + // If there was a candidate that didn't get transformed then let's try + // doing the register liveness going forward. Sometimes one direction + // is overly conservative compared to the other. + LiveRegs.clear(); + LiveRegs.addLiveIns(MBB); + + auto NextCandidateIter = MIReplacements.begin(); + + for (auto I = MBB.begin(); I != MBB.end(); ++I) { + MachineInstr *MI = &*I; + SmallVector<std::pair<unsigned, const MachineOperand *>, 4> Clobbers; + LiveRegs.stepForward(*MI, Clobbers); + + // Only check and create a new instruction if this instruction is + // known to be a candidate that didn't get transformed. 
+ if (NextCandidateIter->first == MI) { + if (NextCandidateIter->second == nullptr) { + MachineInstr *NewMI = tryReplaceInstr(MI, MBB, WasCandidate); + NextCandidateIter->second = NewMI; + } + ++NextCandidateIter; + } + } + } + while (!MIReplacements.empty()) { MachineInstr *MI = MIReplacements.back().first; MachineInstr *NewMI = MIReplacements.back().second; MIReplacements.pop_back(); - MBB.insert(MI, NewMI); - MBB.erase(MI); + if (NewMI) { + MBB.insert(MI, NewMI); + MBB.erase(MI); + } } } Index: test/CodeGen/X86/fixup-bw-inst-fwlive.ll =================================================================== --- test/CodeGen/X86/fixup-bw-inst-fwlive.ll +++ test/CodeGen/X86/fixup-bw-inst-fwlive.ll @@ -0,0 +1,117 @@ +; RUN: llc -fixup-byte-word-insts=1 -march=x86-64 < %s | \ +; RUN: FileCheck -check-prefix CHECK -check-prefix BWON %s +; RUN: llc -fixup-byte-word-insts=0 -march=x86-64 < %s | \ +; RUN: FileCheck -check-prefix CHECK -check-prefix BWOFF %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; These CHECKs are complex. They were derived by running +; fixup-byte-word-insts with and without the forward liveness checking code +; in order to create this unit test, and using bugpoint to reduce the test +; to something as simple as possible that would reproduce the need for the +; forward liveness. 
+; +; CHECK-LABEL: getAndMoveToFrontDecode: +; CHECK-LABEL: # BB#1: +; BWON: movl %eax, %ecx +; BWOFF: movb %al, %cl +; CHECK-NEXT: shll %cl, %esi + +@bsLive = external global i32, align 4 +@bsBuff = external global i32, align 4 +@limit = external global [6 x [258 x i32]], align 16 + +define void @getAndMoveToFrontDecode() { +entry: + br label %while.body.i.i + +while.body.i.i: ; preds = %if.end.i.i, %entry + br i1 false, label %if.then.i.i, label %if.end.i.i + +if.then.i.i: ; preds = %while.body.i.i + unreachable + +if.end.i.i: ; preds = %while.body.i.i + br i1 undef, label %while.body.i.i, label %vector.body + +vector.body: ; preds = %if.end.i.i + br label %while.body.i + +while.body.i: ; preds = %if.end.i, %vector.body + br i1 false, label %if.then.i, label %if.end.i + +if.then.i: ; preds = %while.body.i + unreachable + +if.end.i: ; preds = %while.body.i + br i1 undef, label %while.body.i, label %bsR.exit + +bsR.exit: ; preds = %if.end.i + br i1 undef, label %while.end307, label %if.end57 + +if.end57: ; preds = %while.end297, %bsR.exit + br i1 false, label %do.body, label %if.else172 + +do.body: ; preds = %if.end57 + unreachable + +if.else172: ; preds = %if.end57 + %cmp174 = icmp slt i32 undef, undef + br i1 %cmp174, label %if.end177, label %if.then176 + +if.then176: ; preds = %if.else172 + unreachable + +if.end177: ; preds = %if.else172 + %0 = load i32, i32* undef, align 4 + %idxprom264480 = sext i32 %0 to i64 + br i1 false, label %while.body.i438, label %entry.while.end_crit_edge.i435 + +entry.while.end_crit_edge.i435: ; preds = %if.end177 + %.pre.i434 = load i32, i32* @bsBuff, align 4 + %1 = trunc i64 %idxprom264480 to i32 + %2 = trunc i64 %idxprom264480 to i32 + %sub.i447 = sub nsw i32 0, %1 + %shr.i448 = lshr i32 %.pre.i434, %sub.i447 + %shl4.i449 = shl i32 1, %2 + %sub5.i450 = add nsw i32 %shl4.i449, -1 + %and6.i451 = and i32 %shr.i448, %sub5.i450 + store i32 %sub.i447, i32* @bsLive, align 4 + %arrayidx267481 = getelementptr inbounds [6 x [258 x i32]], [6 
x [258 x i32]]* @limit, i64 0, i64 undef, i64 %idxprom264480 + %3 = load i32, i32* %arrayidx267481, align 4 + %cmp268482 = icmp sgt i32 %and6.i451, %3 + br i1 %cmp268482, label %while.body270, label %while.end297 + +while.body.i438: ; preds = %if.end177 + unreachable + +while.body270: ; preds = %while.end290, %entry.while.end_crit_edge.i435 + %zvec248.0484 = phi i32 [ %or296, %while.end290 ], [ %and6.i451, %entry.while.end_crit_edge.i435 ] + %indvars.iv.next529 = add i64 undef, 1 + %cmp273478 = icmp slt i32 undef, 1 + br i1 %cmp273478, label %while.body275, label %while.end290 + +while.body275: ; preds = %if.end282, %while.body270 + br i1 undef, label %if.then281, label %if.end282 + +if.then281: ; preds = %while.body275 + unreachable + +if.end282: ; preds = %while.body275 + br i1 undef, label %while.body275, label %while.end290 + +while.end290: ; preds = %if.end282, %while.body270 + %shl295 = shl i32 %zvec248.0484, 1 + %or296 = or i32 0, %shl295 + %arrayidx267 = getelementptr inbounds [6 x [258 x i32]], [6 x [258 x i32]]* @limit, i64 0, i64 undef, i64 %indvars.iv.next529 + %4 = load i32, i32* %arrayidx267, align 4 + %cmp268 = icmp sgt i32 %or296, %4 + br i1 %cmp268, label %while.body270, label %while.end297 + +while.end297: ; preds = %while.end290, %entry.while.end_crit_edge.i435 + br i1 undef, label %while.end307, label %if.end57 + +while.end307: ; preds = %while.end297, %bsR.exit + ret void +}