diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -741,7 +741,8 @@ bool convertToImmediateForm(MachineInstr &MI, MachineInstr **KilledDef = nullptr) const; bool foldFrameOffset(MachineInstr &MI) const; - bool combineRLWINM(MachineInstr &MI, MachineInstr **ToErase = nullptr) const; + bool simplifyRotateAndMaskInstr(MachineInstr &MI, + MachineInstr *&ToErase) const; bool isADDIInstrEligibleForFolding(MachineInstr &ADDIMI, int64_t &Imm) const; bool isADDInstrEligibleForFolding(MachineInstr &ADDMI) const; bool isImmInstrEligibleForFolding(MachineInstr &MI, unsigned &BaseReg, diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -3890,18 +3890,78 @@ return false; } -bool PPCInstrInfo::combineRLWINM(MachineInstr &MI, - MachineInstr **ToErase) const { +// Return true if SrcMI and MI are both 32-bit or both 64-bit instructions. +static bool sameWidthMIs(MachineInstr *SrcMI, MachineInstr *MI, bool &Is64Bit) { + unsigned Opc = MI->getOpcode(); + unsigned SrcOpc = SrcMI->getOpcode(); + if ((SrcOpc == PPC::RLWINM8 || SrcOpc == PPC::RLWINM8_rec) && + (Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8_rec)) { + Is64Bit = true; + return true; + } + if ((SrcOpc == PPC::RLWINM || SrcOpc == PPC::RLWINM_rec) && + (Opc == PPC::RLWINM || Opc == PPC::RLWINM_rec)) + return true; + return false; +} + +// This function tries to combine two RLWINMs. We perform this optimization +// not only in SSA form but also after RA, since some RLWINMs are generated +// after RA.
+bool PPCInstrInfo::simplifyRotateAndMaskInstr(MachineInstr &MI, + MachineInstr *&ToErase) const { + unsigned UseOpc = MI.getOpcode(); + if (UseOpc != PPC::RLWINM && UseOpc != PPC::RLWINM_rec && + UseOpc != PPC::RLWINM8 && UseOpc != PPC::RLWINM8_rec) + return false; + + // Find the source MI. MachineRegisterInfo *MRI = &MI.getParent()->getParent()->getRegInfo(); Register FoldingReg = MI.getOperand(1).getReg(); - if (!FoldingReg.isVirtual()) + MachineInstr *SrcMI = nullptr; + bool CanErase = false; + bool OtherIntermediateUse = true; + if (MRI->isSSA()) { + if (!FoldingReg.isVirtual()) + return false; + SrcMI = MRI->getVRegDef(FoldingReg); + } else { + if (!Register::isPhysicalRegister(FoldingReg)) + return false; + SrcMI = getDefMIPostRA(FoldingReg, MI, OtherIntermediateUse); + } + if (!SrcMI) return false; - MachineInstr *SrcMI = MRI->getVRegDef(FoldingReg); - if (SrcMI->getOpcode() != PPC::RLWINM && - SrcMI->getOpcode() != PPC::RLWINM_rec && - SrcMI->getOpcode() != PPC::RLWINM8 && - SrcMI->getOpcode() != PPC::RLWINM8_rec) + + // Check if MI and SrcMI are both 32-bit or both 64-bit instructions. + // TODO: The pairs of RLWINM8(RLWINM) or RLWINM(RLWINM8) never occur before + // RA, only after RA. Even if they do not have the same bit width, we can do + // the foldings for RLWINM8(RLWINM)->RLWINM8, or RLWINM(RLWINM8)->RLWINM. + bool Is64Bit = false; + if (!sameWidthMIs(SrcMI, &MI, Is64Bit)) return false; + + // Check if the registers (def and use) meet the requirements for folding.
+ MachineOperand ForwardRegOp = SrcMI->getOperand(1); + Register ForwardReg = ForwardRegOp.getReg(); + bool IsFwdFeederRegKilled = false; + bool SeenIntermediateUse = false; + bool IsMIUseRegKilled = MI.getOperand(1).isKill(); + if (MRI->isSSA()) { + CanErase = !SrcMI->hasImplicitDef() && MRI->hasOneNonDBGUse(FoldingReg); + } else { + bool KillFwdDefMI = !OtherIntermediateUse && IsMIUseRegKilled; + CanErase = KillFwdDefMI && !SrcMI->hasImplicitDef(); + // In post-RA, if SrcMI also defines the register to be forwarded, we can + // only do the folding if SrcMI is going to be erased. + if (!CanErase && SrcMI->definesRegister(ForwardReg)) + return false; + // Check if the SrcReg can be forwarded to MI. + if (!isRegElgibleForForwarding(ForwardRegOp, *SrcMI, MI, KillFwdDefMI, + IsFwdFeederRegKilled, SeenIntermediateUse)) + return false; + } + assert((MI.getOperand(2).isImm() && MI.getOperand(3).isImm() && MI.getOperand(4).isImm() && SrcMI->getOperand(2).isImm() && SrcMI->getOperand(3).isImm() && SrcMI->getOperand(4).isImm()) && @@ -3912,7 +3972,6 @@ uint64_t MBMI = MI.getOperand(3).getImm(); uint64_t MESrc = SrcMI->getOperand(4).getImm(); uint64_t MEMI = MI.getOperand(4).getImm(); - assert((MEMI < 32 && MESrc < 32 && MBMI < 32 && MBSrc < 32) && "Invalid PPC::RLWINM Instruction!"); // If MBMI is bigger than MEMI, we always can not get run of ones. @@ -3935,7 +3994,8 @@ // MaskMI: -----------|--E B------ // Result: -----------|--- ------- (Good candidate) - // Mark special case. + // Mark the special cases of all bits in a 64-bit register or the low 32 bits + // in a 64-bit register. bool SrcMaskFull = (MBSrc - MESrc == 1) || (MBSrc == 0 && MESrc == 31); // For other MBMI > MEMI cases, just return. @@ -3945,8 +4005,8 @@ // Handle MBMI <= MEMI cases. APInt MaskMI = APInt::getBitsSetWithWrap(32, 32 - MEMI - 1, 32 - MBMI); // In MI, we only need low 32 bits of SrcMI, just consider about low 32 - // bit of SrcMI mask. 
Note that in APInt, lowerest bit is at index 0, - // while in PowerPC ISA, lowerest bit is at index 63. + // bit of SrcMI mask. Note that in APInt, the least significant bit is at + // index 0, while in PowerPC ISA, the least significant bit is at index 63. APInt MaskSrc = APInt::getBitsSetWithWrap(32, 32 - MESrc - 1, 32 - MBSrc); APInt RotatedSrcMask = MaskSrc.rotl(SHMI); @@ -3954,29 +4014,23 @@ uint32_t NewMB, NewME; bool Simplified = false; - // If final mask is 0, MI result should be 0 too. + // If final mask is 0, replace MI with LI/LI8 0 or ANDI_rec/ANDI8_rec 0. if (FinalMask.isZero()) { - bool Is64Bit = - (MI.getOpcode() == PPC::RLWINM8 || MI.getOpcode() == PPC::RLWINM8_rec); Simplified = true; LLVM_DEBUG(dbgs() << "Replace Instr: "); LLVM_DEBUG(MI.dump()); - if (MI.getOpcode() == PPC::RLWINM || MI.getOpcode() == PPC::RLWINM8) { - // Replace MI with "LI 0" - MI.removeOperand(4); - MI.removeOperand(3); - MI.removeOperand(2); - MI.getOperand(1).ChangeToImmediate(0); - MI.setDesc(get(Is64Bit ? PPC::LI8 : PPC::LI)); - } else { - // Replace MI with "ANDI_rec reg, 0" - MI.removeOperand(4); - MI.removeOperand(3); - MI.getOperand(2).setImm(0); - MI.setDesc(get(Is64Bit ? PPC::ANDI8_rec : PPC::ANDI_rec)); - MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg()); - if (SrcMI->getOperand(1).isKill()) { + LoadImmediateInfo LII; + LII.Imm = 0; + LII.Is64Bit = Is64Bit; + LII.SetCR = (UseOpc == PPC::RLWINM_rec || UseOpc == PPC::RLWINM8_rec); + replaceInstrWithLI(MI, LII); + if (LII.SetCR) { + MI.getOperand(1).setReg(ForwardReg); + // FIXME: If the register used by MI is `killed` before change, we need to + // update the kill flag on the previous use of that register. Here we only + // consider the kill flag of the register used by SrcMI.
+ if (ForwardRegOp.isKill()) { MI.getOperand(1).setIsKill(true); SrcMI->getOperand(1).setIsKill(false); } else @@ -3986,7 +4040,6 @@ LLVM_DEBUG(dbgs() << "With: "); LLVM_DEBUG(MI.dump()); - } else if ((isRunOfOnes((unsigned)(FinalMask.getZExtValue()), NewMB, NewME) && NewMB <= NewME) || SrcMaskFull) { @@ -3997,15 +4050,14 @@ LLVM_DEBUG(dbgs() << "Converting Instr: "); LLVM_DEBUG(MI.dump()); - uint16_t NewSH = (SHSrc + SHMI) % 32; - MI.getOperand(2).setImm(NewSH); - // If SrcMI mask is full, no need to update MBMI and MEMI. + MI.getOperand(2).setImm((SHSrc + SHMI) % 32); + // If SrcMI mask is full, do not update MBMI and MEMI. if (!SrcMaskFull) { MI.getOperand(3).setImm(NewMB); MI.getOperand(4).setImm(NewME); } - MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg()); - if (SrcMI->getOperand(1).isKill()) { + MI.getOperand(1).setReg(ForwardReg); + if (ForwardRegOp.isKill()) { MI.getOperand(1).setIsKill(true); SrcMI->getOperand(1).setIsKill(false); } else @@ -4015,12 +4067,10 @@ LLVM_DEBUG(dbgs() << "To: "); LLVM_DEBUG(MI.dump()); } - if (Simplified & MRI->use_nodbg_empty(FoldingReg) && - !SrcMI->hasImplicitDef()) { - // If FoldingReg has no non-debug use and it has no implicit def (it - // is not RLWINMO or RLWINM8o), it's safe to delete its def SrcMI. - // Otherwise keep it. - *ToErase = SrcMI; + if (Simplified && CanErase) { + // If SrcMI has no implicit def, and FoldingReg has no non-debug use or + // its flag is "killed", it's safe to delete SrcMI. Otherwise keep it. 
+ ToErase = SrcMI; LLVM_DEBUG(dbgs() << "Delete dead instruction: "); LLVM_DEBUG(SrcMI->dump()); } diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp --- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -1055,7 +1055,7 @@ case PPC::RLWINM_rec: case PPC::RLWINM8: case PPC::RLWINM8_rec: { - Simplified = TII->combineRLWINM(MI, &ToErase); + Simplified = TII->simplifyRotateAndMaskInstr(MI, ToErase); if (Simplified) ++NumRotatesCollapsed; break; diff --git a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp --- a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp +++ b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp @@ -40,6 +40,8 @@ "Number of folding frame offset by using r+r in pre-emit peephole"); STATISTIC(NumCmpsInPreEmit, "Number of compares eliminated in pre-emit peephole"); +STATISTIC(NumRotateInstrFoldInPreEmit, + "Number of folding Rotate instructions in pre-emit peephole"); static cl::opt EnablePCRelLinkerOpt("ppc-pcrel-linker-opt", cl::Hidden, cl::init(true), @@ -517,6 +519,13 @@ LLVM_DEBUG(MI.dump()); InstrsToErase.push_back(&MI); } + MachineInstr *ToErase = nullptr; + if (TII->simplifyRotateAndMaskInstr(MI, ToErase)) { + Changed = true; + NumRotateInstrFoldInPreEmit++; + if (ToErase) + InstrsToErase.push_back(ToErase); + } } // Eliminate conditional branch based on a constant CR bit by diff --git a/llvm/test/CodeGen/PowerPC/fold-rlwinm-after-ra.mir b/llvm/test/CodeGen/PowerPC/fold-rlwinm-after-ra.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/fold-rlwinm-after-ra.mir @@ -0,0 +1,194 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown -stop-after \ +# RUN: ppc-pre-emit-peephole %s -o - | FileCheck %s + +--- +name: testFoldRLWINM +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r3 + ; CHECK-LABEL: name: 
testFoldRLWINM + ; CHECK: liveins: $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $r3 = RLWINM killed renamable $r3, 14, 0, 12, implicit-def $x3 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit $rm, implicit killed $x3 + $r3 = RLWINM killed $r3, 27, 5, 31 + dead renamable $r3 = RLWINM killed renamable $r3, 19, 0, 12, implicit-def $x3 + BLR8 implicit $lr8, implicit $rm, implicit killed $x3 +... +--- +name: testFoldRLWINMSrcFullMask1 +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r3 + ; CHECK-LABEL: name: testFoldRLWINMSrcFullMask1 + ; CHECK: liveins: $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $r3 = RLWINM killed renamable $r3, 14, 0, 12, implicit-def $x3 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit $rm, implicit killed $x3 + $r3 = RLWINM killed $r3, 27, 0, 31 + dead renamable $r3 = RLWINM killed renamable $r3, 19, 0, 12, implicit-def $x3 + BLR8 implicit $lr8, implicit $rm, implicit killed $x3 +... +--- +name: testFoldRLWINMSrcFullMask2 +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r2, $r3 + ; CHECK-LABEL: name: testFoldRLWINMSrcFullMask2 + ; CHECK: liveins: $r2, $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $r3 = RLWINM $r2, 14, 10, 1, implicit-def $x3 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit $rm, implicit killed $x3 + $r3 = RLWINM killed $r2, 27, 10, 9 + dead renamable $r3 = RLWINM killed renamable $r3, 19, 10, 1, implicit-def $x3 + BLR8 implicit $lr8, implicit $rm, implicit killed $x3 +... +--- +name: testFoldRLWINMSrcWrapped +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r3 + ; CHECK-LABEL: name: testFoldRLWINMSrcWrapped + ; CHECK: liveins: $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $r3 = RLWINM killed renamable $r3, 14, 11, 12, implicit-def $x3 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit $rm, implicit killed $x3 + $r3 = RLWINM killed $r3, 27, 30, 10 + dead renamable $r3 = RLWINM killed renamable $r3, 19, 0, 12, implicit-def $x3 + BLR8 implicit $lr8, implicit $rm, implicit killed $x3 +... 
+--- +name: testFoldRLWINMUserWrapped +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r3 + ; CHECK-LABEL: name: testFoldRLWINMUserWrapped + ; CHECK: liveins: $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $r3 = RLWINM killed $r3, 10, 5, 31 + ; CHECK-NEXT: renamable $r3 = RLWINM killed renamable $r3, 10, 30, 5, implicit-def $x3 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit $rm, implicit killed $x3 + $r3 = RLWINM killed $r3, 10, 5, 31 + dead renamable $r3 = RLWINM killed renamable $r3, 10, 30, 5, implicit-def $x3 + BLR8 implicit $lr8, implicit $rm, implicit killed $x3 +... +--- +name: testFoldRLWINMResultWrapped +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r3 + ; CHECK-LABEL: name: testFoldRLWINMResultWrapped + ; CHECK: liveins: $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $r3 = RLWINM killed $r3, 10, 20, 10 + ; CHECK-NEXT: renamable $r3 = RLWINM killed renamable $r3, 10, 0, 31, implicit-def $x3 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit $rm, implicit killed $x3 + $r3 = RLWINM killed $r3, 10, 20, 10 + dead renamable $r3 = RLWINM killed renamable $r3, 10, 0, 31, implicit-def $x3 + BLR8 implicit $lr8, implicit $rm, implicit killed $x3 +... +--- +name: testFoldRLWINMToZero +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r3 + ; CHECK-LABEL: name: testFoldRLWINMToZero + ; CHECK: liveins: $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $r3 = LI 0 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit $rm, implicit killed $x3 + $r3 = RLWINM killed $r3, 27, 5, 10 + dead renamable $r3 = RLWINM killed renamable $r3, 8, 5, 10, implicit-def $x3 + BLR8 implicit $lr8, implicit $rm, implicit killed $x3 +... 
+--- +name: testFoldRLWINM_recToZero +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r3 + ; CHECK-LABEL: name: testFoldRLWINM_recToZero + ; CHECK: liveins: $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead renamable $r3 = ANDI_rec killed renamable $r3, 0, implicit-def $cr0 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit $rm, implicit killed $cr0 + $r3 = RLWINM killed $r3, 27, 5, 10 + dead renamable $r3 = RLWINM_rec killed renamable $r3, 8, 5, 10, implicit-def $cr0 + BLR8 implicit $lr8, implicit $rm, implicit killed $cr0 +... +--- +name: testFoldRLWINMInvalidMask +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r3 + ; CHECK-LABEL: name: testFoldRLWINMInvalidMask + ; CHECK: liveins: $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $r3 = RLWINM killed $r3, 20, 5, 31 + ; CHECK-NEXT: renamable $r3 = RLWINM killed renamable $r3, 19, 10, 20, implicit-def $x3 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit $rm, implicit killed $x3 + $r3 = RLWINM killed $r3, 20, 5, 31 + dead renamable $r3 = RLWINM killed renamable $r3, 19, 10, 20, implicit-def $x3 + BLR8 implicit $lr8, implicit $rm, implicit killed $x3 +... +--- +name: testFoldRLWINCanNotBeDeleted +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r2, $r3 + ; CHECK-LABEL: name: testFoldRLWINCanNotBeDeleted + ; CHECK: liveins: $r2, $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $r3 = RLWINM_rec $r2, 27, 5, 10, implicit-def dead $cr0 + ; CHECK-NEXT: dead renamable $r3 = ANDI_rec $r2, 0, implicit-def $cr0 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit $rm, implicit killed $cr0 + $r3 = RLWINM_rec $r2, 27, 5, 10, implicit-def $cr0 + dead renamable $r3 = RLWINM_rec killed renamable $r3, 8, 5, 10, implicit-def $cr0 + BLR8 implicit $lr8, implicit $rm, implicit killed $cr0 +... 
+--- +name: testCanNotFoldRLWINM +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r3 + ; CHECK-LABEL: name: testCanNotFoldRLWINM + ; CHECK: liveins: $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $r3 = RLWINM_rec killed $r3, 27, 5, 10, implicit-def dead $cr0 + ; CHECK-NEXT: dead renamable $r3 = RLWINM_rec killed renamable $r3, 8, 5, 10, implicit-def $cr0 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit $rm, implicit killed $cr0 + $r3 = RLWINM_rec $r3, 27, 5, 10, implicit-def $cr0 + dead renamable $r3 = RLWINM_rec killed renamable $r3, 8, 5, 10, implicit-def $cr0 + BLR8 implicit $lr8, implicit $rm, implicit killed $cr0 +... +--- +name: testCanNotFoldRLWINM2 +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r2, $r3 + ; CHECK-LABEL: name: testCanNotFoldRLWINM2 + ; CHECK: liveins: $r2, $r3, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: STD $x2, -8, $x1 :: (store (s64) into %stack.0) + ; CHECK-NEXT: $r3 = RLWINM killed $r2, 4, 28, 31 + ; CHECK-NEXT: $r2 = LI 0, implicit-def $x2 + ; CHECK-NEXT: $x2 = LD -8, $x1 :: (load (s64) from %stack.0) + ; CHECK-NEXT: renamable $r3 = RLWINM killed renamable $r3, 19, 0, 12, implicit-def $x3 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x2, implicit killed $x3 + $r3 = RLWINM killed $r2, 4, 28, 31 + $r2 = LI 0, implicit-def $x2 + dead renamable $r3 = RLWINM killed renamable $r3, 19, 0, 12, implicit-def $x3 + BLR8 implicit $lr8, implicit $rm, implicit killed $x2, implicit killed $x3 +... diff --git a/llvm/test/CodeGen/PowerPC/vsx_builtins.ll b/llvm/test/CodeGen/PowerPC/vsx_builtins.ll --- a/llvm/test/CodeGen/PowerPC/vsx_builtins.ll +++ b/llvm/test/CodeGen/PowerPC/vsx_builtins.ll @@ -160,8 +160,7 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvtdivdp cr0, v2, v3 ; CHECK-NEXT: mfocrf r3, 128 -; CHECK-NEXT: srwi r3, r3, 28 -; CHECK-NEXT: rlwinm r3, r3, 28, 31, 31 +; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: blr entry: %0 = tail call i32 @llvm.ppc.vsx.xvtdivdp(<2 x double> %a, <2 x double> %b)