diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -3189,91 +3189,189 @@
   return false;
 }
 
+// This function tries to combine an RLWINM followed by an RLWINM/ANDI_rec.
+// We do this combining not only in SSA form but also after RA, since some
+// RLWINMs are generated after RA.
 bool PPCInstrInfo::combineRLWINM(MachineInstr &MI,
                                  MachineInstr **ToErase) const {
-  MachineRegisterInfo *MRI = &MI.getParent()->getParent()->getRegInfo();
-  unsigned FoldingReg = MI.getOperand(1).getReg();
-  if (!Register::isVirtualRegister(FoldingReg))
+  unsigned MIOpCode = MI.getOpcode();
+  if (MIOpCode != PPC::ANDI_rec && MIOpCode != PPC::ANDI8_rec &&
+      MIOpCode != PPC::RLWINM && MIOpCode != PPC::RLWINM_rec &&
+      MIOpCode != PPC::RLWINM8 && MIOpCode != PPC::RLWINM8_rec)
     return false;
-  MachineInstr *SrcMI = MRI->getVRegDef(FoldingReg);
-  if (SrcMI->getOpcode() != PPC::RLWINM &&
-      SrcMI->getOpcode() != PPC::RLWINM_rec &&
-      SrcMI->getOpcode() != PPC::RLWINM8 &&
-      SrcMI->getOpcode() != PPC::RLWINM8_rec)
+  MachineRegisterInfo *MRI = &MI.getParent()->getParent()->getRegInfo();
+  Register FoldingReg = MI.getOperand(1).getReg();
+  MachineInstr *SrcMI;
+  // If we're in SSA.
+  if (MRI->isSSA()) {
+    if (!Register::isVirtualRegister(FoldingReg))
+      return false;
+    SrcMI = MRI->getVRegDef(FoldingReg);
+    if (!SrcMI)
+      return false;
+  }
+  // If we're after RA.
+  else {
+    bool OtherIntermediateUse = false;
+    SrcMI = getDefMIPostRA(FoldingReg, MI, OtherIntermediateUse);
+    // Make sure there is no intermediate user.
+    if (OtherIntermediateUse || !SrcMI)
+      return false;
+  }
+  unsigned SrcOpCode = SrcMI->getOpcode();
+  if (SrcOpCode != PPC::RLWINM && SrcOpCode != PPC::RLWINM_rec &&
+      SrcOpCode != PPC::RLWINM8 && SrcOpCode != PPC::RLWINM8_rec)
     return false;
-  assert((MI.getOperand(2).isImm() && MI.getOperand(3).isImm() &&
-          MI.getOperand(4).isImm() && SrcMI->getOperand(2).isImm() &&
-          SrcMI->getOperand(3).isImm() && SrcMI->getOperand(4).isImm()) &&
+  assert((SrcMI->getOperand(2).isImm() && SrcMI->getOperand(3).isImm() &&
+          SrcMI->getOperand(4).isImm()) &&
          "Invalid PPC::RLWINM Instruction!");
   uint64_t SHSrc = SrcMI->getOperand(2).getImm();
-  uint64_t SHMI = MI.getOperand(2).getImm();
   uint64_t MBSrc = SrcMI->getOperand(3).getImm();
-  uint64_t MBMI = MI.getOperand(3).getImm();
   uint64_t MESrc = SrcMI->getOperand(4).getImm();
-  uint64_t MEMI = MI.getOperand(4).getImm();
-
-  assert((MEMI < 32 && MESrc < 32 && MBMI < 32 && MBSrc < 32) &&
-         "Invalid PPC::RLWINM Instruction!");
-  // If MBMI is bigger than MEMI, we always can not get run of ones.
-  // RotatedSrcMask non-wrap:
-  //                 0........31|32........63
-  // RotatedSrcMask:   B---E        B---E
-  // MaskMI:         -----------|--E  B------
-  // Result:           -----          ---     (Bad candidate)
-  //
-  // RotatedSrcMask wrap:
-  //                 0........31|32........63
-  // RotatedSrcMask: --E   B----|--E   B----
-  // MaskMI:         -----------|--E  B------
-  // Result:         ---   -----|---   ----- (Bad candidate)
-  //
-  // One special case is RotatedSrcMask is a full set mask.
-  // RotatedSrcMask full:
-  //                 0........31|32........63
-  // RotatedSrcMask: ------EB---|-------EB---
-  // MaskMI:         -----------|--E  B------
-  // Result:         -----------|---  ------- (Good candidate)
-
-  // Mark special case.
-  bool SrcMaskFull = (MBSrc - MESrc == 1) || (MBSrc == 0 && MESrc == 31);
-
-  // For other MBMI > MEMI cases, just return.
-  if ((MBMI > MEMI) && !SrcMaskFull)
+  assert((MESrc < 32 && MBSrc < 32) && "Invalid PPC::RLWINM Instruction!");
+  bool IsSrc64Bit =
+      (SrcOpCode == PPC::RLWINM8 || SrcOpCode == PPC::RLWINM8_rec ||
+       SrcOpCode == PPC::ANDI8_rec);
+  bool Is64Bit = (MIOpCode == PPC::RLWINM8 || MIOpCode == PPC::RLWINM8_rec ||
+                  MIOpCode == PPC::ANDI8_rec);
+  if (Is64Bit != IsSrc64Bit)
     return false;
-
-  // Handle MBMI <= MEMI cases.
-  APInt MaskMI = APInt::getBitsSetWithWrap(32, 32 - MEMI - 1, 32 - MBMI);
-  // In MI, we only need low 32 bits of SrcMI, just consider about low 32
-  // bit of SrcMI mask. Note that in APInt, lowerest bit is at index 0,
-  // while in PowerPC ISA, lowerest bit is at index 63.
-  APInt MaskSrc = APInt::getBitsSetWithWrap(32, 32 - MESrc - 1, 32 - MBSrc);
-
-  APInt RotatedSrcMask = MaskSrc.rotl(SHMI);
-  APInt FinalMask = RotatedSrcMask & MaskMI;
-  uint32_t NewMB, NewME;
   bool Simplified = false;
+  bool IsFoldingRegKilled = MI.getOperand(1).isKill();
+  if (MIOpCode == PPC::ANDI_rec || MIOpCode == PPC::ANDI8_rec) {
+    uint64_t Imm = MI.getOperand(2).getImm();
+    // Combine RLWINM and ANDI_rec into a single RLWINM.
+    // RLWINM can be used to extract an n-bit field, and the pair can be
+    // folded when the RLWINM is followed by an ANDI_rec whose immediate is
+    // composed of one contiguous run of 1s with any number of 0s on both
+    // sides, like 0b0110.
+    // The RLWINM mask extracts the n-bit field:
+    //              0........31|32........63
+    // ExtractMask: -----------|////-------
+    // ANDIMask:    --/--------|-----------
+    // Result:      -----------|--/--------
+    if (SHSrc > 0 && MBSrc > 0 && MESrc == 31 && (SHSrc + MBSrc) > 31) {
+      uint32_t ExtraNum = 32 - MBSrc;
+      uint32_t ExtraBit = SHSrc + MBSrc - 32;
+      uint64_t AndMask = Imm & maskTrailingOnes<uint64_t>(ExtraNum);
+      if (isShiftedMask_64(AndMask)) {
+        LLVM_DEBUG(dbgs() << "Combining pair: ");
+        LLVM_DEBUG(SrcMI->dump());
+        LLVM_DEBUG(MI.dump());
+
+        Simplified = true;
+        ExtraNum = countPopulation(AndMask);
+        ExtraBit += countTrailingZeros(AndMask);
+        MI.setDesc(get(Is64Bit ? PPC::RLWINM8_rec : PPC::RLWINM_rec));
+        MI.RemoveOperand(2);
+        MI.RemoveOperand(1);
+        MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+            .add(SrcMI->getOperand(1))
+            .addImm(ExtraBit + ExtraNum)
+            .addImm(32 - ExtraNum)
+            .addImm(31);
+
+        LLVM_DEBUG(dbgs() << "To: ");
+        LLVM_DEBUG(MI.dump());
+      }
+    }
+  }
+  // Combine two RLWINMs.
+  else if (MIOpCode == PPC::RLWINM || MIOpCode == PPC::RLWINM_rec ||
+           MIOpCode == PPC::RLWINM8 || MIOpCode == PPC::RLWINM8_rec) {
+    assert((MI.getOperand(2).isImm() && MI.getOperand(3).isImm() &&
+            MI.getOperand(4).isImm()) &&
+           "Invalid PPC::RLWINM Instruction!");
+    uint64_t SHMI = MI.getOperand(2).getImm();
+    uint64_t MBMI = MI.getOperand(3).getImm();
+    uint64_t MEMI = MI.getOperand(4).getImm();
+    assert((MEMI < 32 && MBMI < 32) && "Invalid PPC::RLWINM Instruction!");
+    // If MBMI is bigger than MEMI, we can never get a run of ones.
+    // RotatedSrcMask non-wrap:
+    //                 0........31|32........63
+    // RotatedSrcMask:   B---E        B---E
+    // MaskMI:         -----------|--E  B------
+    // Result:           -----          ---     (Bad candidate)
+    //
+    // RotatedSrcMask wrap:
+    //                 0........31|32........63
+    // RotatedSrcMask: --E   B----|--E   B----
+    // MaskMI:         -----------|--E  B------
+    // Result:         ---   -----|---   ----- (Bad candidate)
+    //
+    // One special case is RotatedSrcMask being a full set mask.
+    // RotatedSrcMask full:
+    //                 0........31|32........63
+    // RotatedSrcMask: ------EB---|-------EB---
+    // MaskMI:         -----------|--E  B------
+    // Result:         -----------|---  ------- (Good candidate)
+
+    // Mark special case.
+    bool SrcMaskFull = (MBSrc - MESrc == 1) || (MBSrc == 0 && MESrc == 31);
+
+    // For other MBMI > MEMI cases, just return.
+    if ((MBMI > MEMI) && !SrcMaskFull)
+      return false;
-  // If final mask is 0, MI result should be 0 too.
-  if (FinalMask.isNullValue()) {
-    bool Is64Bit =
-        (MI.getOpcode() == PPC::RLWINM8 || MI.getOpcode() == PPC::RLWINM8_rec);
-    Simplified = true;
-    LLVM_DEBUG(dbgs() << "Replace Instr: ");
-    LLVM_DEBUG(MI.dump());
-
-    if (MI.getOpcode() == PPC::RLWINM || MI.getOpcode() == PPC::RLWINM8) {
-      // Replace MI with "LI 0"
-      MI.RemoveOperand(4);
-      MI.RemoveOperand(3);
-      MI.RemoveOperand(2);
-      MI.getOperand(1).ChangeToImmediate(0);
-      MI.setDesc(get(Is64Bit ? PPC::LI8 : PPC::LI));
-    } else {
-      // Replace MI with "ANDI_rec reg, 0"
-      MI.RemoveOperand(4);
-      MI.RemoveOperand(3);
-      MI.getOperand(2).setImm(0);
-      MI.setDesc(get(Is64Bit ? PPC::ANDI8_rec : PPC::ANDI_rec));
+    // Handle MBMI <= MEMI cases.
+    APInt MaskMI = APInt::getBitsSetWithWrap(32, 32 - MEMI - 1, 32 - MBMI);
+    // In MI, we only need the low 32 bits of SrcMI, so just consider the
+    // low 32 bits of SrcMI's mask. Note that in APInt the lowest bit is at
+    // index 0, while in the PowerPC ISA the lowest bit is at index 63.
+    APInt MaskSrc = APInt::getBitsSetWithWrap(32, 32 - MESrc - 1, 32 - MBSrc);
+
+    APInt RotatedSrcMask = MaskSrc.rotl(SHMI);
+    APInt FinalMask = RotatedSrcMask & MaskMI;
+    uint32_t NewMB, NewME;
+    // If final mask is 0, MI result should be 0 too.
+    if (FinalMask.isNullValue()) {
+
+      LLVM_DEBUG(dbgs() << "Replace Instr: ");
+      LLVM_DEBUG(MI.dump());
+
+      Simplified = true;
+      if (MIOpCode == PPC::RLWINM || MIOpCode == PPC::RLWINM8) {
+        // Replace MI with "LI 0"
+        MI.RemoveOperand(4);
+        MI.RemoveOperand(3);
+        MI.RemoveOperand(2);
+        MI.getOperand(1).ChangeToImmediate(0);
+        MI.setDesc(get(Is64Bit ? PPC::LI8 : PPC::LI));
+      } else {
+        // Replace MI with "ANDI_rec reg, 0"
+        MI.RemoveOperand(4);
+        MI.RemoveOperand(3);
+        MI.getOperand(2).setImm(0);
+        MI.setDesc(get(Is64Bit ? PPC::ANDI8_rec : PPC::ANDI_rec));
+        MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg());
+        if (SrcMI->getOperand(1).isKill()) {
+          MI.getOperand(1).setIsKill(true);
+          SrcMI->getOperand(1).setIsKill(false);
+        } else
+          // About to replace MI.getOperand(1), clear its kill flag.
+          MI.getOperand(1).setIsKill(false);
+      }
+      LLVM_DEBUG(dbgs() << "With: ");
+      LLVM_DEBUG(MI.dump());
+    } else if ((isRunOfOnes((unsigned)(FinalMask.getZExtValue()), NewMB,
+                            NewME) &&
+                NewMB <= NewME) ||
+               SrcMaskFull) {
+      // Here we only handle MBMI <= MEMI cases, so NewMB must be no bigger
+      // than NewME. Otherwise we would get a 64-bit value after folding,
+      // but MI returns a 32-bit value.
+
+      LLVM_DEBUG(dbgs() << "Converting Instr: ");
+      LLVM_DEBUG(MI.dump());
+
+      Simplified = true;
+      uint16_t NewSH = (SHSrc + SHMI) % 32;
+      MI.getOperand(2).setImm(NewSH);
+      // If SrcMI mask is full, no need to update MBMI and MEMI.
+      if (!SrcMaskFull) {
+        MI.getOperand(3).setImm(NewMB);
+        MI.getOperand(4).setImm(NewME);
+      }
       MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg());
       if (SrcMI->getOperand(1).isKill()) {
         MI.getOperand(1).setIsKill(true);
@@ -3281,47 +3379,20 @@
       } else
         // About to replace MI.getOperand(1), clear its kill flag.
         MI.getOperand(1).setIsKill(false);
-    }
-
-    LLVM_DEBUG(dbgs() << "With: ");
-    LLVM_DEBUG(MI.dump());
-
-  } else if ((isRunOfOnes((unsigned)(FinalMask.getZExtValue()), NewMB, NewME) &&
-              NewMB <= NewME) ||
-             SrcMaskFull) {
-    // Here we only handle MBMI <= MEMI case, so NewMB must be no bigger
-    // than NewME. Otherwise we get a 64 bit value after folding, but MI
-    // return a 32 bit value.
-    Simplified = true;
-    LLVM_DEBUG(dbgs() << "Converting Instr: ");
-    LLVM_DEBUG(MI.dump());
-    uint16_t NewSH = (SHSrc + SHMI) % 32;
-    MI.getOperand(2).setImm(NewSH);
-    // If SrcMI mask is full, no need to update MBMI and MEMI.
-    if (!SrcMaskFull) {
-      MI.getOperand(3).setImm(NewMB);
-      MI.getOperand(4).setImm(NewME);
+      LLVM_DEBUG(dbgs() << "To: ");
+      LLVM_DEBUG(MI.dump());
     }
-    MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg());
-    if (SrcMI->getOperand(1).isKill()) {
-      MI.getOperand(1).setIsKill(true);
-      SrcMI->getOperand(1).setIsKill(false);
-    } else
-      // About to replace MI.getOperand(1), clear its kill flag.
-      MI.getOperand(1).setIsKill(false);
-
-    LLVM_DEBUG(dbgs() << "To: ");
-    LLVM_DEBUG(MI.dump());
   }
-  if (Simplified & MRI->use_nodbg_empty(FoldingReg) &&
-      !SrcMI->hasImplicitDef()) {
-    // If FoldingReg has no non-debug use and it has no implicit def (it
-    // is not RLWINMO or RLWINM8o), it's safe to delete its def SrcMI.
-    // Otherwise keep it.
-    *ToErase = SrcMI;
-    LLVM_DEBUG(dbgs() << "Delete dead instruction: ");
-    LLVM_DEBUG(SrcMI->dump());
+  if (Simplified && !SrcMI->hasImplicitDef()) {
+    // If SrcMI has no implicit def and FoldingReg has no non-debug use or
+    // is marked killed, it's safe to delete SrcMI. Otherwise keep it.
+    if ((!MRI->isSSA() && IsFoldingRegKilled) ||
+        (MRI->isSSA() && MRI->use_nodbg_empty(FoldingReg))) {
+      *ToErase = SrcMI;
+      LLVM_DEBUG(dbgs() << "Delete dead instruction: ");
+      LLVM_DEBUG(SrcMI->dump());
+    }
   }
   return Simplified;
 }
diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
--- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -844,6 +844,8 @@
       combineSEXTAndSHL(MI, ToErase);
       break;
     }
+    case PPC::ANDI_rec:
+    case PPC::ANDI8_rec:
     case PPC::RLWINM:
     case PPC::RLWINM_rec:
     case PPC::RLWINM8:
diff --git a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
--- a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
@@ -37,6 +37,8 @@
           "Number of self copy instructions eliminated");
 STATISTIC(NumFrameOffFoldInPreEmit,
           "Number of folding frame offset by using r+r in pre-emit peephole");
+STATISTIC(NumCombineRLWINMInPreEmit,
+          "Number of RLWINMs combined with ANDI/RLWINM in pre-emit peephole");
 
 static cl::opt<bool>
 EnablePCRelLinkerOpt("ppc-pcrel-linker-opt", cl::Hidden, cl::init(true),
@@ -413,6 +415,13 @@
         LLVM_DEBUG(dbgs() << "Frame offset folding by using index form: ");
         LLVM_DEBUG(MI.dump());
       }
+      MachineInstr *ToErase = nullptr;
+      if (TII->combineRLWINM(MI, &ToErase)) {
+        Changed = true;
+        NumCombineRLWINMInPreEmit++;
+        if (ToErase)
+          InstrsToErase.push_back(ToErase);
+      }
     }
 
     // Eliminate conditional branch based on a constant CR bit by
diff --git a/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir b/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir
--- a/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir
+++ b/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir
@@ -192,8 +192,7 @@
     ; CHECK: liveins: $x3
     ; CHECK: [[COPY:%[0-9]+]]:g8rc = COPY $x3
     ; CHECK: [[COPY1:%[0-9]+]]:gprc = COPY [[COPY]].sub_32
-    ; CHECK: [[RLWINM:%[0-9]+]]:gprc = RLWINM [[COPY1]], 4, 28, 31
-    ; CHECK: [[ANDI_rec:%[0-9]+]]:gprc = ANDI_rec [[RLWINM]], 4, implicit-def $cr0
+    ; CHECK: [[RLWINM_rec:%[0-9]+]]:gprc = RLWINM_rec [[COPY1]], 3, 31, 31, implicit-def $cr0
     ; CHECK: BLR8 implicit $lr8, implicit $rm
     %0:g8rc = COPY $x3
     %1:gprc = COPY %0.sub_32:g8rc
diff --git a/llvm/test/CodeGen/PowerPC/sms-phi-5.ll b/llvm/test/CodeGen/PowerPC/sms-phi-5.ll
--- a/llvm/test/CodeGen/PowerPC/sms-phi-5.ll
+++ b/llvm/test/CodeGen/PowerPC/sms-phi-5.ll
@@ -14,9 +14,8 @@
 ; CHECK-NEXT:    # %bb.2:
 ; CHECK-NEXT:    lhz 3, 0(3)
 ; CHECK-NEXT:    slwi 3, 3, 15
-; CHECK-NEXT:    clrlwi 3, 3, 31
-; CHECK-NEXT:    rlwinm 4, 3, 31, 17, 31
-; CHECK-NEXT:    or 3, 3, 4
+; CHECK-NEXT:    li 4, 0
+; CHECK-NEXT:    ori 3, 4, 0
 ; CHECK-NEXT:    rlwimi 3, 3, 15, 0, 16
 ; CHECK-NEXT:    # %bb.3:
 ; CHECK-NEXT:    blr
diff --git a/llvm/test/CodeGen/PowerPC/vsx_builtins.ll b/llvm/test/CodeGen/PowerPC/vsx_builtins.ll
--- a/llvm/test/CodeGen/PowerPC/vsx_builtins.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx_builtins.ll
@@ -113,8 +113,7 @@
 ; CHECK-NEXT:    xvtdivdp cr0, v2, v3
 ; CHECK-NEXT:    li r4, 222
 ; CHECK-NEXT:    mfocrf r3, 128
-; CHECK-NEXT:    srwi r3, r3, 28
-; CHECK-NEXT:    andi. r3, r3, 2
+; CHECK-NEXT:    rlwinm. r3, r3, 2, 31, 31
 ; CHECK-NEXT:    li r3, 22
 ; CHECK-NEXT:    iseleq r3, r4, r3
 ; CHECK-NEXT:    blr
@@ -131,8 +130,7 @@
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvtdivdp cr0, v2, v3
 ; CHECK-NEXT:    mfocrf r3, 128
-; CHECK-NEXT:    srwi r3, r3, 28
-; CHECK-NEXT:    rlwinm r3, r3, 28, 31, 31
+; CHECK-NEXT:    li r3, 0
 ; CHECK-NEXT:    blr
 entry:
   %0 = tail call i32 @llvm.ppc.vsx.xvtdivdp(<2 x double> %a, <2 x double> %b)
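
Note (not part of the patch): the following standalone C++ program sketches the mask arithmetic behind the two-RLWINM fold in combineRLWINM above, using plain uint32_t in place of APInt. The helper names maskFromMBME, rotl32, and isRunOfOnes32 are invented for this sketch; they mirror APInt::getBitsSetWithWrap, APInt::rotl, and PPC's isRunOfOnes respectively.

#include <bit>
#include <cstdint>
#include <cstdio>

// An RLWINM mask is a run of ones from IBM (big-endian) bit MB through bit
// ME, wrapping around when MB > ME; IBM bit I of a 32-bit word is
// 1u << (31 - I). Mirrors APInt::getBitsSetWithWrap(32, 32-ME-1, 32-MB).
static uint32_t maskFromMBME(unsigned MB, unsigned ME) {
  uint32_t Mask = 0;
  for (unsigned I = MB;; I = (I + 1) % 32) {
    Mask |= 1u << (31 - I);
    if (I == ME)
      break;
  }
  return Mask;
}

static uint32_t rotl32(uint32_t V, unsigned N) {
  N %= 32;
  return N ? (V << N) | (V >> (32 - N)) : V;
}

// Like PPC's isRunOfOnes: true if V is one contiguous (possibly wrapped)
// run of ones, reported as IBM bits MB..ME.
static bool isRunOfOnes32(uint32_t V, unsigned &MB, unsigned &ME) {
  auto IsShiftedMask = [](uint32_t M) {
    return M != 0 && ((M + (M & -M)) & M) == 0;
  };
  if (IsShiftedMask(V)) { // unwrapped run
    MB = std::countl_zero(V);
    ME = 31 - std::countr_zero(V);
    return true;
  }
  if (V != 0 && IsShiftedMask(~V)) { // wrapped run: the zeros are contiguous
    ME = std::countl_zero(~V) - 1;
    MB = 32 - std::countr_zero(~V);
    return true;
  }
  return false;
}

// Folds "RLWINM SHSrc, MBSrc, MESrc" feeding "RLWINM SHMI, MBMI, MEMI",
// following the same steps as combineRLWINM.
static void combineTwoRLWINM(unsigned SHSrc, unsigned MBSrc, unsigned MESrc,
                             unsigned SHMI, unsigned MBMI, unsigned MEMI) {
  bool SrcMaskFull = (MBSrc - MESrc == 1) || (MBSrc == 0 && MESrc == 31);
  if (MBMI > MEMI && !SrcMaskFull) {
    printf("not foldable\n"); // a wrapped MaskMI can never give a run of ones
    return;
  }
  uint32_t FinalMask =
      rotl32(maskFromMBME(MBSrc, MESrc), SHMI) & maskFromMBME(MBMI, MEMI);
  unsigned NewSH = (SHSrc + SHMI) % 32, NewMB, NewME;
  if (FinalMask == 0) // the two masks do not overlap
    printf("fold to: LI 0 (or ANDI_rec with 0 for recording forms)\n");
  else if (SrcMaskFull) // full source mask: keep MBMI/MEMI unchanged
    printf("fold to: RLWINM %u, %u, %u\n", NewSH, MBMI, MEMI);
  else if (isRunOfOnes32(FinalMask, NewMB, NewME) && NewMB <= NewME)
    printf("fold to: RLWINM %u, %u, %u\n", NewSH, NewMB, NewME);
  else
    printf("not foldable\n");
}

int main() {
  combineTwoRLWINM(4, 28, 31, 4, 24, 27); // fold to: RLWINM 8, 24, 27
  combineTwoRLWINM(4, 28, 31, 4, 0, 23);  // fold to: LI 0
}

The two calls in main are made-up examples: the first rotates the combined amount and intersects the masks into a new run of ones, and the second shows disjoint masks degenerating into LI 0, the same two outcomes handled by the pass.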
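
A similar standalone sketch (again not part of the patch, with an invented helper name) of the index arithmetic used by the RLWINM + ANDI_rec fold. The example input mirrors the fold-rlwinm.mir change above, where RLWINM 4, 28, 31 feeding ANDI_rec 4 becomes RLWINM_rec 3, 31, 31.

#include <bit>
#include <cstdint>
#include <cstdio>

// Mirrors the RLWINM + ANDI_rec arithmetic in combineRLWINM: the source
// RLWINM (SHSrc, MBSrc, MESrc == 31) extracts a (32 - MBSrc)-bit field into
// the low bits, and the ANDI_rec immediate must select one contiguous run
// of ones inside that field.
static void combineRLWINMAndANDI(unsigned SHSrc, unsigned MBSrc,
                                 unsigned MESrc, uint64_t Imm) {
  if (!(SHSrc > 0 && MBSrc > 0 && MESrc == 31 && SHSrc + MBSrc > 31)) {
    printf("pattern not matched\n");
    return;
  }
  unsigned ExtraNum = 32 - MBSrc;         // width of the extracted field
  unsigned ExtraBit = SHSrc + MBSrc - 32; // its offset in the rotated word
  // maskTrailingOnes<uint64_t>(ExtraNum) in the patch.
  uint64_t AndMask = Imm & ((1ull << ExtraNum) - 1);
  // isShiftedMask_64(AndMask) in the patch: one contiguous run of ones.
  if (AndMask == 0 || ((AndMask + (AndMask & -AndMask)) & AndMask) != 0) {
    printf("pattern not matched\n");
    return;
  }
  ExtraNum = std::popcount(AndMask);     // countPopulation(AndMask)
  ExtraBit += std::countr_zero(AndMask); // countTrailingZeros(AndMask)
  // New instruction: RLWINM_rec SH, MB, 31, matching the operands the
  // patch builds with addImm.
  printf("fold to: RLWINM_rec %u, %u, 31\n", ExtraBit + ExtraNum,
         32 - ExtraNum);
}

int main() {
  combineRLWINMAndANDI(4, 28, 31, 4); // fold to: RLWINM_rec 3, 31, 31
}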