diff --git a/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
--- a/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
@@ -21,6 +21,8 @@
 #define DEBUG_TYPE "riscv-sextw-removal"
 
 STATISTIC(NumRemovedSExtW, "Number of removed sign-extensions");
+STATISTIC(NumTransformedToWInstrs,
+          "Number of instructions transformed to W-ops");
 
 static cl::opt<bool> DisableSExtWRemoval("riscv-disable-sextw-removal",
                                          cl::desc("Disable removal of sext.w"),
@@ -55,11 +57,143 @@
   return new RISCVSExtWRemoval();
 }
 
+// Add all uses of MI to the Worklist.
+static void addUses(const MachineInstr &MI,
+                    SmallVectorImpl<const MachineInstr *> &Worklist,
+                    MachineRegisterInfo &MRI) {
+  for (auto &UserOp : MRI.reg_operands(MI.getOperand(0).getReg())) {
+    const auto *User = UserOp.getParent();
+    if (User == &MI) // Ignore the def, i.e. MI itself.
+      continue;
+    Worklist.push_back(User);
+  }
+}
+
+// Returns true if all uses of OrigMI only depend on the lower word of its
+// output, so we can transform OrigMI to the corresponding W-version.
+// TODO: Handle multiple interdependent transformations.
+static bool isAllUsesReadW(const MachineInstr &OrigMI,
+                           MachineRegisterInfo &MRI) {
+
+  SmallPtrSet<const MachineInstr *, 4> Visited;
+  SmallVector<const MachineInstr *, 4> Worklist;
+
+  Visited.insert(&OrigMI);
+  addUses(OrigMI, Worklist, MRI);
+
+  while (!Worklist.empty()) {
+    const MachineInstr *MI = Worklist.pop_back_val();
+
+    if (!Visited.insert(MI).second) {
+      // If we've looped back to OrigMI through a PHI cycle, we can't transform
+      // LD or LWU, because these operations use all 64 bits of their input.
+      if (MI == &OrigMI) {
+        unsigned Opcode = MI->getOpcode();
+        if (Opcode == RISCV::LD || Opcode == RISCV::LWU)
+          return false;
+      }
+      continue;
+    }
+
+    switch (MI->getOpcode()) {
+    case RISCV::ADDIW:
+    case RISCV::ADDW:
+    case RISCV::DIVUW:
+    case RISCV::DIVW:
+    case RISCV::MULW:
+    case RISCV::REMUW:
+    case RISCV::REMW:
+    case RISCV::SLLIW:
+    case RISCV::SLLW:
+    case RISCV::SRAIW:
+    case RISCV::SRAW:
+    case RISCV::SRLIW:
+    case RISCV::SRLW:
+    case RISCV::SUBW:
+    case RISCV::ROLW:
+    case RISCV::RORW:
+    case RISCV::RORIW:
+    case RISCV::CLZW:
+    case RISCV::CTZW:
+    case RISCV::CPOPW:
+    case RISCV::SLLI_UW:
+    case RISCV::FCVT_S_W:
+    case RISCV::FCVT_S_WU:
+    case RISCV::FCVT_D_W:
+    case RISCV::FCVT_D_WU:
+      continue;
+
+    // For certain immediates these ignore the upper input bits entirely;
+    // otherwise the upper bits matter, so check that their uses only read W.
+    case RISCV::SLLI:
+      if (MI->getOperand(2).getImm() >= 32)
+        continue;
+      addUses(*MI, Worklist, MRI);
+      continue;
+    case RISCV::ANDI:
+      if (isUInt<11>(MI->getOperand(2).getImm()))
+        continue;
+      addUses(*MI, Worklist, MRI);
+      continue;
+    case RISCV::ORI:
+      if (!isUInt<11>(MI->getOperand(2).getImm()))
+        continue;
+      addUses(*MI, Worklist, MRI);
+      continue;
+
+    case RISCV::BEXTI:
+      if (MI->getOperand(2).getImm() >= 32)
+        return false;
+      continue;
+
+    // For these instructions, the lower word of the output depends only on
+    // the lower word of the input, so check that all uses only read W.
+    case RISCV::COPY:
+    case RISCV::PHI:
+
+    case RISCV::ADD:
+    case RISCV::ADDI:
+    case RISCV::AND:
+    case RISCV::MUL:
+    case RISCV::OR:
+    case RISCV::SLL:
+    case RISCV::SUB:
+    case RISCV::XOR:
+    case RISCV::XORI:
+
+    case RISCV::ADD_UW:
+    case RISCV::ANDN:
+    case RISCV::CLMUL:
+    case RISCV::ORC_B:
+    case RISCV::ORN:
+    case RISCV::SEXT_B:
+    case RISCV::SEXT_H:
+    case RISCV::SH1ADD:
+    case RISCV::SH1ADD_UW:
+    case RISCV::SH2ADD:
+    case RISCV::SH2ADD_UW:
+    case RISCV::SH3ADD:
+    case RISCV::SH3ADD_UW:
+    case RISCV::XNOR:
+    case RISCV::ZEXT_H_RV64:
+      addUses(*MI, Worklist, MRI);
+      continue;
+    default:
+      return false;
+    }
+  }
+  return true;
+}
+
 // This function returns true if the machine instruction always outputs a value
 // where bits 63:32 match bit 31.
+// Alternatively, if the instruction can be converted to a W variant
+// (e.g. ADD->ADDW) and all of its uses only use the lower word of its output,
+// then return true and add it to FixableDef to be converted later.
 // TODO: Allocate a bit in TSFlags for the W instructions?
 // TODO: Add other W instructions.
-static bool isSignExtendingOpW(const MachineInstr &MI) {
+static bool isSignExtendingOpW(MachineInstr &MI, MachineRegisterInfo &MRI,
+                               SmallPtrSetImpl<MachineInstr *> &FixableDef) {
   switch (MI.getOpcode()) {
   case RISCV::LUI:
   case RISCV::LW:
@@ -117,7 +251,14 @@
     return MI.getOperand(2).getImm() > 32;
   // The LI pattern ADDI rd, X0, imm is sign extended.
   case RISCV::ADDI:
-    return MI.getOperand(1).isReg() && MI.getOperand(1).getReg() == RISCV::X0;
+    if (MI.getOperand(1).isReg() && MI.getOperand(1).getReg() == RISCV::X0)
+      return true;
+    if (isAllUsesReadW(MI, MRI)) {
+      // Transform to ADDIW.
+      FixableDef.insert(&MI);
+      return true;
+    }
+    return false;
   // An ANDI with an 11 bit immediate will zero bits 63:11.
   case RISCV::ANDI:
     return isUInt<11>(MI.getOperand(2).getImm());
@@ -127,28 +268,45 @@
   // Copying from X0 produces zero.
   case RISCV::COPY:
     return MI.getOperand(1).getReg() == RISCV::X0;
+
+  // These opcodes can be "fixed" by replacing them with the W-version if we
+  // know all users of the result only rely on bits 31:0.
+  case RISCV::SLLI:
+    // SLLIW reads the lowest 5 bits of the shift amount, while SLLI reads 6.
+    if (MI.getOperand(2).getImm() >= 32)
+      return false;
+    LLVM_FALLTHROUGH;
+  case RISCV::ADD:
+  case RISCV::LD:
+  case RISCV::LWU:
+  case RISCV::MUL:
+  case RISCV::SUB:
+    if (isAllUsesReadW(MI, MRI)) {
+      FixableDef.insert(&MI);
+      return true;
+    }
   }
 
   return false;
 }
 
-static bool isSignExtendedW(const MachineInstr &OrigMI,
-                            MachineRegisterInfo &MRI) {
+static bool isSignExtendedW(MachineInstr &OrigMI, MachineRegisterInfo &MRI,
+                            SmallPtrSetImpl<MachineInstr *> &FixableDef) {
 
   SmallPtrSet<const MachineInstr *, 4> Visited;
-  SmallVector<const MachineInstr *, 4> Worklist;
+  SmallVector<MachineInstr *, 4> Worklist;
 
   Worklist.push_back(&OrigMI);
 
   while (!Worklist.empty()) {
-    const MachineInstr *MI = Worklist.pop_back_val();
+    MachineInstr *MI = Worklist.pop_back_val();
 
     // If we already visited this instruction, we don't need to check it again.
     if (!Visited.insert(MI).second)
       continue;
 
     // If this is a sign extending operation we don't need to look any further.
-    if (isSignExtendingOpW(*MI))
+    if (isSignExtendingOpW(*MI, MRI, FixableDef))
      continue;
 
     // Is this an instruction that propagates sign extend.
@@ -164,7 +322,7 @@
       // If this is a copy from another register, check its source instruction.
       if (!SrcReg.isVirtual())
         return false;
-      const MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
+      MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
       if (!SrcMI)
         return false;
 
@@ -190,7 +348,7 @@
       Register SrcReg = MI->getOperand(1).getReg();
       if (!SrcReg.isVirtual())
         return false;
-      const MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
+      MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
       if (!SrcMI)
         return false;
 
@@ -228,7 +386,7 @@
         Register SrcReg = MI->getOperand(I).getReg();
         if (!SrcReg.isVirtual())
           return false;
-        const MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
+        MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
         if (!SrcMI)
           return false;
 
@@ -246,6 +404,26 @@
   return true;
 }
 
+static unsigned getWOp(unsigned Opcode) {
+  switch (Opcode) {
+  case RISCV::ADDI:
+    return RISCV::ADDIW;
+  case RISCV::ADD:
+    return RISCV::ADDW;
+  case RISCV::LD:
+  case RISCV::LWU:
+    return RISCV::LW;
+  case RISCV::MUL:
+    return RISCV::MULW;
+  case RISCV::SLLI:
+    return RISCV::SLLIW;
+  case RISCV::SUB:
+    return RISCV::SUBW;
+  default:
+    llvm_unreachable("Unexpected opcode for replacement with W variant");
+  }
+}
+
 bool RISCVSExtWRemoval::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()) || DisableSExtWRemoval)
     return false;
@@ -256,7 +434,10 @@
   if (!ST.is64Bit())
     return false;
 
-  bool MadeChange = false;
+  SmallPtrSet<MachineInstr *, 4> SExtWRemovalCands;
+
+  // Replacing instructions invalidates the MI iterator. We collect the
+  // candidates first, then iterate over them separately.
   for (MachineBasicBlock &MBB : MF) {
     for (auto I = MBB.begin(), IE = MBB.end(); I != IE;) {
       MachineInstr *MI = &*I++;
@@ -271,21 +452,49 @@
       if (!SrcReg.isVirtual())
         continue;
 
-      const MachineInstr &SrcMI = *MRI.getVRegDef(SrcReg);
-      if (!isSignExtendedW(SrcMI, MRI))
-        continue;
+      SExtWRemovalCands.insert(MI);
+    }
+  }
 
-      Register DstReg = MI->getOperand(0).getReg();
-      if (!MRI.constrainRegClass(SrcReg, MRI.getRegClass(DstReg)))
-        continue;
+  bool MadeChange = false;
+  for (auto MI : SExtWRemovalCands) {
+    SmallPtrSet<MachineInstr *, 4> FixableDef;
+    Register SrcReg = MI->getOperand(1).getReg();
+    MachineInstr &SrcMI = *MRI.getVRegDef(SrcReg);
+
+    // If all definitions reaching MI sign-extend their output, then the
+    // sext.w is redundant.
+    if (!isSignExtendedW(SrcMI, MRI, FixableDef))
+      continue;
 
-      LLVM_DEBUG(dbgs() << "Removing redundant sign-extension\n");
-      MRI.replaceRegWith(DstReg, SrcReg);
-      MRI.clearKillFlags(SrcReg);
-      MI->eraseFromParent();
-      ++NumRemovedSExtW;
-      MadeChange = true;
+    Register DstReg = MI->getOperand(0).getReg();
+    if (!MRI.constrainRegClass(SrcReg, MRI.getRegClass(DstReg)))
+      continue;
+    // Replace Fixable instructions with their W versions.
+ for (MachineInstr *Fixable : FixableDef) { + MachineBasicBlock &MBB = *Fixable->getParent(); + const DebugLoc &DL = Fixable->getDebugLoc(); + unsigned Code = getWOp(Fixable->getOpcode()); + MachineInstrBuilder Replacement = + BuildMI(MBB, Fixable, DL, ST.getInstrInfo()->get(Code)); + for (auto Op : Fixable->operands()) + Replacement.add(Op); + for (auto Op : Fixable->memoperands()) + Replacement.addMemOperand(Op); + + LLVM_DEBUG(dbgs() << "Replacing " << *Fixable); + LLVM_DEBUG(dbgs() << " with " << *Replacement); + + Fixable->eraseFromParent(); + ++NumTransformedToWInstrs; } + + LLVM_DEBUG(dbgs() << "Removing redundant sign-extension\n"); + MRI.replaceRegWith(DstReg, SrcReg); + MRI.clearKillFlags(SrcReg); + MI->eraseFromParent(); + ++NumRemovedSExtW; + MadeChange = true; } return MadeChange; diff --git a/llvm/test/CodeGen/RISCV/rv64zbt.ll b/llvm/test/CodeGen/RISCV/rv64zbt.ll --- a/llvm/test/CodeGen/RISCV/rv64zbt.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbt.ll @@ -812,9 +812,8 @@ ; RV64I-LABEL: fshri_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: srliw a1, a1, 5 -; RV64I-NEXT: slli a0, a0, 27 +; RV64I-NEXT: slliw a0, a0, 27 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: sext.w a0, a0 ; RV64I-NEXT: ret ; ; RV64ZBT-LABEL: fshri_i32: @@ -865,9 +864,8 @@ ; RV64I-LABEL: fshli_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: srliw a1, a1, 27 -; RV64I-NEXT: slli a0, a0, 5 +; RV64I-NEXT: slliw a0, a0, 5 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: sext.w a0, a0 ; RV64I-NEXT: ret ; ; RV64ZBT-LABEL: fshli_i32: diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll --- a/llvm/test/CodeGen/RISCV/sextw-removal.ll +++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll @@ -562,3 +562,149 @@ bb7: ; preds = %bb2 ret void } + +; simple test for forward-searching. (and 1234) only uses lower word of input +define signext i32 @test11(i64 %arg1, i64 %arg2, i64 %arg3) { +; CHECK-LABEL: test11: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: li a3, 256 +; CHECK-NEXT: .LBB10_1: # %bb2 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: andi a0, a0, 1234 +; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addw a0, a0, a1 +; CHECK-NEXT: bltu a2, a3, .LBB10_1 +; CHECK-NEXT: # %bb.2: # %bb7 +; CHECK-NEXT: ret +; +; NOREMOVAL-LABEL: test11: +; NOREMOVAL: # %bb.0: # %entry +; NOREMOVAL-NEXT: addi a2, a2, -1 +; NOREMOVAL-NEXT: li a3, 256 +; NOREMOVAL-NEXT: .LBB10_1: # %bb2 +; NOREMOVAL-NEXT: # =>This Inner Loop Header: Depth=1 +; NOREMOVAL-NEXT: andi a0, a0, 1234 +; NOREMOVAL-NEXT: addi a2, a2, 1 +; NOREMOVAL-NEXT: add a0, a0, a1 +; NOREMOVAL-NEXT: bltu a2, a3, .LBB10_1 +; NOREMOVAL-NEXT: # %bb.2: # %bb7 +; NOREMOVAL-NEXT: sext.w a0, a0 +; NOREMOVAL-NEXT: ret +entry: + br label %bb2 + +bb2: ; preds = %bb2, %entry + %i1 = phi i64 [ %arg1, %entry ], [ %i5, %bb2 ] + %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ] + %i3 = add i64 %i2, 1 + %i4 = and i64 %i1, 1234 + %i5 = add i64 %i4, %arg2 + %i6 = icmp ugt i64 %i2, 255 + br i1 %i6, label %bb7, label %bb2 + +bb7: ; preds = %bb2 + %i7 = trunc i64 %i5 to i32 + ret i32 %i7 +} + +; circular use-dependency and multiple transformations. 
+define signext i32 @test12(i64 %arg1, i64 %arg2, i64 %arg3) { +; CHECK-LABEL: test12: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: li a4, 256 +; CHECK-NEXT: .LBB11_1: # %bb2 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: xor a0, a0, a1 +; CHECK-NEXT: mulw a2, a0, a1 +; CHECK-NEXT: addw a0, a0, a2 +; CHECK-NEXT: and a2, a2, a0 +; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: add a0, a2, a1 +; CHECK-NEXT: bltu a3, a4, .LBB11_1 +; CHECK-NEXT: # %bb.2: # %bb7 +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: ret +; +; NOREMOVAL-LABEL: test12: +; NOREMOVAL: # %bb.0: # %entry +; NOREMOVAL-NEXT: addi a2, a2, -1 +; NOREMOVAL-NEXT: li a3, 256 +; NOREMOVAL-NEXT: .LBB11_1: # %bb2 +; NOREMOVAL-NEXT: # =>This Inner Loop Header: Depth=1 +; NOREMOVAL-NEXT: xor a0, a0, a1 +; NOREMOVAL-NEXT: mul a4, a0, a1 +; NOREMOVAL-NEXT: add a0, a0, a4 +; NOREMOVAL-NEXT: and a4, a4, a0 +; NOREMOVAL-NEXT: addi a2, a2, 1 +; NOREMOVAL-NEXT: add a0, a4, a1 +; NOREMOVAL-NEXT: bltu a2, a3, .LBB11_1 +; NOREMOVAL-NEXT: # %bb.2: # %bb7 +; NOREMOVAL-NEXT: sext.w a0, a4 +; NOREMOVAL-NEXT: ret +entry: + br label %bb2 + +bb2: ; preds = %bb2, %entry + %i1 = phi i64 [ %arg1, %entry ], [ %i6, %bb2 ] + %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ] + %i3 = add i64 %i2, 1 + %i4 = xor i64 %i1, %arg2 + %i5 = mul i64 %i4, %arg2 + %i9 = add i64 %i4, %i5 + %i8 = and i64 %i5, %i9 + %i6 = add i64 %i8, %arg2 + %i7 = icmp ugt i64 %i2, 255 + br i1 %i7, label %bb7, label %bb2 + +bb7: ; preds = %bb2 + %r = trunc i64 %i8 to i32 + ret i32 %r +} + +; Not optimized. sdiv doesn't only use lower word +define signext i32 @test13(i64 %arg1, i64 %arg2, i64 %arg3) { +; CHECK-LABEL: test13: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: li a3, 256 +; CHECK-NEXT: .LBB12_1: # %bb2 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: div a0, a0, a1 +; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: bltu a2, a3, .LBB12_1 +; CHECK-NEXT: # %bb.2: # %bb7 +; CHECK-NEXT: sext.w a0, a0 +; CHECK-NEXT: ret +; +; NOREMOVAL-LABEL: test13: +; NOREMOVAL: # %bb.0: # %entry +; NOREMOVAL-NEXT: addi a2, a2, -1 +; NOREMOVAL-NEXT: li a3, 256 +; NOREMOVAL-NEXT: .LBB12_1: # %bb2 +; NOREMOVAL-NEXT: # =>This Inner Loop Header: Depth=1 +; NOREMOVAL-NEXT: div a0, a0, a1 +; NOREMOVAL-NEXT: addi a2, a2, 1 +; NOREMOVAL-NEXT: add a0, a0, a1 +; NOREMOVAL-NEXT: bltu a2, a3, .LBB12_1 +; NOREMOVAL-NEXT: # %bb.2: # %bb7 +; NOREMOVAL-NEXT: sext.w a0, a0 +; NOREMOVAL-NEXT: ret +entry: + br label %bb2 + +bb2: ; preds = %bb2, %entry + %i1 = phi i64 [ %arg1, %entry ], [ %i5, %bb2 ] + %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ] + %i3 = add i64 %i2, 1 + %i4 = sdiv i64 %i1, %arg2 + %i5 = add i64 %i4, %arg2 + %i6 = icmp ugt i64 %i2, 255 + br i1 %i6, label %bb7, label %bb2 + +bb7: ; preds = %bb2 + %i8 = trunc i64 %i5 to i32 + ret i32 %i8 +}