diff --git a/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
--- a/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
@@ -21,6 +21,8 @@
 #define DEBUG_TYPE "riscv-sextw-removal"
 
 STATISTIC(NumRemovedSExtW, "Number of removed sign-extensions");
+STATISTIC(NumTransformedToWInstrs,
+          "Number of instructions transformed to W-ops");
 
 static cl::opt<bool> DisableSExtWRemoval("riscv-disable-sextw-removal",
                                          cl::desc("Disable removal of sext.w"),
@@ -55,11 +57,143 @@
   return new RISCVSExtWRemoval();
 }
 
+// Add all uses of MI to the Worklist.
+static void addUses(const MachineInstr &MI,
+                    SmallVectorImpl<const MachineInstr *> &Worklist,
+                    MachineRegisterInfo &MRI) {
+  for (auto &UserOp : MRI.reg_operands(MI.getOperand(0).getReg())) {
+    const auto *User = UserOp.getParent();
+    if (User == &MI) // Ignore the def, i.e. MI itself.
+      continue;
+    Worklist.push_back(User);
+  }
+}
+
+// Returns true if all uses of OrigMI only depend on the lower word of its
+// output, so we can transform OrigMI to the corresponding W-version.
+// TODO: Handle multiple interdependent transformations.
+static bool isAllUsesReadW(const MachineInstr &OrigMI,
+                           MachineRegisterInfo &MRI) {
+
+  SmallPtrSet<const MachineInstr *, 4> Visited;
+  SmallVector<const MachineInstr *, 4> Worklist;
+
+  Visited.insert(&OrigMI);
+  addUses(OrigMI, Worklist, MRI);
+
+  while (!Worklist.empty()) {
+    const MachineInstr *MI = Worklist.pop_back_val();
+
+    if (!Visited.insert(MI).second) {
+      // If we've looped back to OrigMI through a PHI cycle, we can't transform
+      // LD or LWU, because these operations use all 64 bits of their input.
+      if (MI == &OrigMI) {
+        unsigned Opcode = MI->getOpcode();
+        if (Opcode == RISCV::LD || Opcode == RISCV::LWU)
+          return false;
+      }
+      continue;
+    }
+
+    switch (MI->getOpcode()) {
+    case RISCV::ADDIW:
+    case RISCV::ADDW:
+    case RISCV::DIVUW:
+    case RISCV::DIVW:
+    case RISCV::MULW:
+    case RISCV::REMUW:
+    case RISCV::REMW:
+    case RISCV::SLLIW:
+    case RISCV::SLLW:
+    case RISCV::SRAIW:
+    case RISCV::SRAW:
+    case RISCV::SRLIW:
+    case RISCV::SRLW:
+    case RISCV::SUBW:
+    case RISCV::ROLW:
+    case RISCV::RORW:
+    case RISCV::RORIW:
+    case RISCV::CLZW:
+    case RISCV::CTZW:
+    case RISCV::CPOPW:
+    case RISCV::SLLI_UW:
+    case RISCV::FCVT_S_W:
+    case RISCV::FCVT_S_WU:
+    case RISCV::FCVT_D_W:
+    case RISCV::FCVT_D_WU:
+      continue;
+
+    // For certain immediates these ignore the upper input bits entirely;
+    // otherwise the upper bits matter, so check that their uses only read W.
+    case RISCV::SLLI:
+      if (MI->getOperand(2).getImm() >= 32)
+        continue;
+      addUses(*MI, Worklist, MRI);
+      continue;
+    case RISCV::ANDI:
+      if (isUInt<11>(MI->getOperand(2).getImm()))
+        continue;
+      addUses(*MI, Worklist, MRI);
+      continue;
+    case RISCV::ORI:
+      if (!isUInt<11>(MI->getOperand(2).getImm()))
+        continue;
+      addUses(*MI, Worklist, MRI);
+      continue;
+
+    case RISCV::BEXTI:
+      if (MI->getOperand(2).getImm() >= 32)
+        return false;
+      continue;
+
+    // For these instructions, the lower word of the output depends only on
+    // the lower word of the input, so check that all uses only read W.
+    case RISCV::COPY:
+    case RISCV::PHI:
+
+    case RISCV::ADD:
+    case RISCV::ADDI:
+    case RISCV::AND:
+    case RISCV::MUL:
+    case RISCV::OR:
+    case RISCV::SLL:
+    case RISCV::SUB:
+    case RISCV::XOR:
+    case RISCV::XORI:
+
+    case RISCV::ADD_UW:
+    case RISCV::ANDN:
+    case RISCV::CLMUL:
+    case RISCV::ORC_B:
+    case RISCV::ORN:
+    case RISCV::SEXT_B:
+    case RISCV::SEXT_H:
+    case RISCV::SH1ADD:
+    case RISCV::SH1ADD_UW:
+    case RISCV::SH2ADD:
+    case RISCV::SH2ADD_UW:
+    case RISCV::SH3ADD:
+    case RISCV::SH3ADD_UW:
+    case RISCV::XNOR:
+    case RISCV::ZEXT_H_RV64:
+      addUses(*MI, Worklist, MRI);
+      continue;
+    default:
+      return false;
+    }
+  }
+  return true;
+}
+
 // This function returns true if the machine instruction always outputs a value
 // where bits 63:32 match bit 31.
+// Alternatively, if the instruction can be converted to a W variant
+// (e.g. ADD->ADDW) and all of its uses only use the lower word of its output,
+// then return true and add it to FixableDef to be converted later.
 // TODO: Allocate a bit in TSFlags for the W instructions?
 // TODO: Add other W instructions.
-static bool isSignExtendingOpW(const MachineInstr &MI) {
+static bool isSignExtendingOpW(MachineInstr &MI, MachineRegisterInfo &MRI,
+                               SmallPtrSetImpl<MachineInstr *> &FixableDef) {
   switch (MI.getOpcode()) {
   case RISCV::LUI:
   case RISCV::LW:
@@ -117,7 +251,14 @@
     return MI.getOperand(2).getImm() > 32;
   // The LI pattern ADDI rd, X0, imm is sign extended.
   case RISCV::ADDI:
-    return MI.getOperand(1).isReg() && MI.getOperand(1).getReg() == RISCV::X0;
+    if (MI.getOperand(1).isReg() && MI.getOperand(1).getReg() == RISCV::X0)
+      return true;
+    if (isAllUsesReadW(MI, MRI)) {
+      // Transform to ADDIW.
+      FixableDef.insert(&MI);
+      return true;
+    }
+    return false;
   // An ANDI with an 11 bit immediate will zero bits 63:11.
   case RISCV::ANDI:
     return isUInt<11>(MI.getOperand(2).getImm());
@@ -127,28 +268,45 @@
   // Copying from X0 produces zero.
   case RISCV::COPY:
     return MI.getOperand(1).getReg() == RISCV::X0;
+
+  // These opcodes can be "fixed" by replacing them with the W-version if we
+  // know all users of the result only rely on bits 31:0.
+  case RISCV::SLLI:
+    // SLLIW reads the lowest 5 bits of the shift amount, while SLLI reads 6.
+    if (MI.getOperand(2).getImm() >= 32)
+      return false;
+    LLVM_FALLTHROUGH;
+  case RISCV::ADD:
+  case RISCV::LD:
+  case RISCV::LWU:
+  case RISCV::MUL:
+  case RISCV::SUB:
+    if (isAllUsesReadW(MI, MRI)) {
+      FixableDef.insert(&MI);
+      return true;
+    }
   }
 
   return false;
 }
 
-static bool isSignExtendedW(const MachineInstr &OrigMI,
-                            MachineRegisterInfo &MRI) {
+static bool isSignExtendedW(MachineInstr &OrigMI, MachineRegisterInfo &MRI,
+                            SmallPtrSetImpl<MachineInstr *> &FixableDef) {
 
   SmallPtrSet<const MachineInstr *, 4> Visited;
-  SmallVector<const MachineInstr *, 4> Worklist;
+  SmallVector<MachineInstr *, 4> Worklist;
 
   Worklist.push_back(&OrigMI);
 
   while (!Worklist.empty()) {
-    const MachineInstr *MI = Worklist.pop_back_val();
+    MachineInstr *MI = Worklist.pop_back_val();
 
     // If we already visited this instruction, we don't need to check it again.
     if (!Visited.insert(MI).second)
       continue;
 
     // If this is a sign extending operation we don't need to look any further.
-    if (isSignExtendingOpW(*MI))
+    if (isSignExtendingOpW(*MI, MRI, FixableDef))
      continue;
 
     // Is this an instruction that propagates sign extend.
@@ -164,7 +322,7 @@
       // If this is a copy from another register, check its source instruction.
       if (!SrcReg.isVirtual())
         return false;
-      const MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
+      MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
       if (!SrcMI)
         return false;
 
@@ -190,7 +348,7 @@
       Register SrcReg = MI->getOperand(1).getReg();
       if (!SrcReg.isVirtual())
         return false;
-      const MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
+      MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
       if (!SrcMI)
         return false;
 
@@ -228,7 +386,7 @@
         Register SrcReg = MI->getOperand(I).getReg();
         if (!SrcReg.isVirtual())
           return false;
-        const MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
+        MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
         if (!SrcMI)
           return false;
 
@@ -246,6 +404,26 @@
   return true;
 }
 
+static unsigned getWOp(unsigned Opcode) {
+  switch (Opcode) {
+  case RISCV::ADDI:
+    return RISCV::ADDIW;
+  case RISCV::ADD:
+    return RISCV::ADDW;
+  case RISCV::LD:
+  case RISCV::LWU:
+    return RISCV::LW;
+  case RISCV::MUL:
+    return RISCV::MULW;
+  case RISCV::SLLI:
+    return RISCV::SLLIW;
+  case RISCV::SUB:
+    return RISCV::SUBW;
+  default:
+    llvm_unreachable("Unexpected opcode for replacement with W variant");
+  }
+}
+
 bool RISCVSExtWRemoval::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()) || DisableSExtWRemoval)
     return false;
@@ -256,7 +434,10 @@
   if (!ST.is64Bit())
     return false;
 
-  bool MadeChange = false;
+  SmallPtrSet<MachineInstr *, 4> SExtWRemovalCands;
+
+  // Replacing instructions invalidates the MI iterator. We collect the
+  // candidates first, then iterate over them separately.
   for (MachineBasicBlock &MBB : MF) {
     for (auto I = MBB.begin(), IE = MBB.end(); I != IE;) {
       MachineInstr *MI = &*I++;
@@ -271,21 +452,49 @@
       if (!SrcReg.isVirtual())
         continue;
 
-      const MachineInstr &SrcMI = *MRI.getVRegDef(SrcReg);
-      if (!isSignExtendedW(SrcMI, MRI))
-        continue;
+      SExtWRemovalCands.insert(MI);
+    }
+  }
 
-      Register DstReg = MI->getOperand(0).getReg();
-      if (!MRI.constrainRegClass(SrcReg, MRI.getRegClass(DstReg)))
-        continue;
+  bool MadeChange = false;
+  for (auto MI : SExtWRemovalCands) {
+    SmallPtrSet<MachineInstr *, 4> FixableDef;
+    Register SrcReg = MI->getOperand(1).getReg();
+    MachineInstr &SrcMI = *MRI.getVRegDef(SrcReg);
+
+    // If all definitions reaching MI sign-extend their output, then the
+    // sext.w is redundant.
+    if (!isSignExtendedW(SrcMI, MRI, FixableDef))
+      continue;
 
-      LLVM_DEBUG(dbgs() << "Removing redundant sign-extension\n");
-      MRI.replaceRegWith(DstReg, SrcReg);
-      MRI.clearKillFlags(SrcReg);
-      MI->eraseFromParent();
-      ++NumRemovedSExtW;
-      MadeChange = true;
+    Register DstReg = MI->getOperand(0).getReg();
+    if (!MRI.constrainRegClass(SrcReg, MRI.getRegClass(DstReg)))
+      continue;
+    // Replace Fixable instructions with their W versions.
+ for (MachineInstr *Fixable : FixableDef) { + MachineBasicBlock &MBB = *Fixable->getParent(); + const DebugLoc &DL = Fixable->getDebugLoc(); + unsigned Code = getWOp(Fixable->getOpcode()); + MachineInstrBuilder Replacement = + BuildMI(MBB, Fixable, DL, ST.getInstrInfo()->get(Code)); + for (auto Op : Fixable->operands()) + Replacement.add(Op); + for (auto Op : Fixable->memoperands()) + Replacement.addMemOperand(Op); + + LLVM_DEBUG(dbgs() << "Replacing " << *Fixable); + LLVM_DEBUG(dbgs() << " with " << *Replacement); + + Fixable->eraseFromParent(); + ++NumTransformedToWInstrs; } + + LLVM_DEBUG(dbgs() << "Removing redundant sign-extension\n"); + MRI.replaceRegWith(DstReg, SrcReg); + MRI.clearKillFlags(SrcReg); + MI->eraseFromParent(); + ++NumRemovedSExtW; + MadeChange = true; } return MadeChange; diff --git a/llvm/test/CodeGen/RISCV/rv64zbt.ll b/llvm/test/CodeGen/RISCV/rv64zbt.ll --- a/llvm/test/CodeGen/RISCV/rv64zbt.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbt.ll @@ -812,9 +812,8 @@ ; RV64I-LABEL: fshri_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: srliw a1, a1, 5 -; RV64I-NEXT: slli a0, a0, 27 +; RV64I-NEXT: slliw a0, a0, 27 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: sext.w a0, a0 ; RV64I-NEXT: ret ; ; RV64ZBT-LABEL: fshri_i32: @@ -865,9 +864,8 @@ ; RV64I-LABEL: fshli_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: srliw a1, a1, 27 -; RV64I-NEXT: slli a0, a0, 5 +; RV64I-NEXT: slliw a0, a0, 5 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: sext.w a0, a0 ; RV64I-NEXT: ret ; ; RV64ZBT-LABEL: fshli_i32: diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll --- a/llvm/test/CodeGen/RISCV/sextw-removal.ll +++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll @@ -562,3 +562,149 @@ bb7: ; preds = %bb2 ret void } + +; simple test for forward-searching. (and 1234) only uses lower word of input +define signext i32 @test11(i64 %arg1, i64 %arg2, i64 %arg3) { +; CHECK-LABEL: test11: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: li a3, 256 +; CHECK-NEXT: .LBB10_1: # %bb2 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: andi a0, a0, 1234 +; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addw a0, a0, a1 +; CHECK-NEXT: bltu a2, a3, .LBB10_1 +; CHECK-NEXT: # %bb.2: # %bb7 +; CHECK-NEXT: ret +; +; NOREMOVAL-LABEL: test11: +; NOREMOVAL: # %bb.0: # %entry +; NOREMOVAL-NEXT: addi a2, a2, -1 +; NOREMOVAL-NEXT: li a3, 256 +; NOREMOVAL-NEXT: .LBB10_1: # %bb2 +; NOREMOVAL-NEXT: # =>This Inner Loop Header: Depth=1 +; NOREMOVAL-NEXT: andi a0, a0, 1234 +; NOREMOVAL-NEXT: addi a2, a2, 1 +; NOREMOVAL-NEXT: add a0, a0, a1 +; NOREMOVAL-NEXT: bltu a2, a3, .LBB10_1 +; NOREMOVAL-NEXT: # %bb.2: # %bb7 +; NOREMOVAL-NEXT: sext.w a0, a0 +; NOREMOVAL-NEXT: ret +entry: + br label %bb2 + +bb2: ; preds = %bb2, %entry + %i1 = phi i64 [ %arg1, %entry ], [ %i5, %bb2 ] + %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ] + %i3 = add i64 %i2, 1 + %i4 = and i64 %i1, 1234 + %i5 = add i64 %i4, %arg2 + %i6 = icmp ugt i64 %i2, 255 + br i1 %i6, label %bb7, label %bb2 + +bb7: ; preds = %bb2 + %i7 = trunc i64 %i5 to i32 + ret i32 %i7 +} + +; circular use-dependency and multiple transformations. 
+define signext i32 @test12(i64 %arg1, i64 %arg2, i64 %arg3) { +; CHECK-LABEL: test12: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: li a4, 256 +; CHECK-NEXT: .LBB11_1: # %bb2 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: xor a0, a0, a1 +; CHECK-NEXT: mulw a2, a0, a1 +; CHECK-NEXT: addw a0, a0, a2 +; CHECK-NEXT: and a2, a2, a0 +; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: add a0, a2, a1 +; CHECK-NEXT: bltu a3, a4, .LBB11_1 +; CHECK-NEXT: # %bb.2: # %bb7 +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: ret +; +; NOREMOVAL-LABEL: test12: +; NOREMOVAL: # %bb.0: # %entry +; NOREMOVAL-NEXT: addi a2, a2, -1 +; NOREMOVAL-NEXT: li a3, 256 +; NOREMOVAL-NEXT: .LBB11_1: # %bb2 +; NOREMOVAL-NEXT: # =>This Inner Loop Header: Depth=1 +; NOREMOVAL-NEXT: xor a0, a0, a1 +; NOREMOVAL-NEXT: mul a4, a0, a1 +; NOREMOVAL-NEXT: add a0, a0, a4 +; NOREMOVAL-NEXT: and a4, a4, a0 +; NOREMOVAL-NEXT: addi a2, a2, 1 +; NOREMOVAL-NEXT: add a0, a4, a1 +; NOREMOVAL-NEXT: bltu a2, a3, .LBB11_1 +; NOREMOVAL-NEXT: # %bb.2: # %bb7 +; NOREMOVAL-NEXT: sext.w a0, a4 +; NOREMOVAL-NEXT: ret +entry: + br label %bb2 + +bb2: ; preds = %bb2, %entry + %i1 = phi i64 [ %arg1, %entry ], [ %i6, %bb2 ] + %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ] + %i3 = add i64 %i2, 1 + %i4 = xor i64 %i1, %arg2 + %i5 = mul i64 %i4, %arg2 + %i9 = add i64 %i4, %i5 + %i8 = and i64 %i5, %i9 + %i6 = add i64 %i8, %arg2 + %i7 = icmp ugt i64 %i2, 255 + br i1 %i7, label %bb7, label %bb2 + +bb7: ; preds = %bb2 + %r = trunc i64 %i8 to i32 + ret i32 %r +} + +; Not optimized. sdiv doesn't only use lower word +define signext i32 @test13(i64 %arg1, i64 %arg2, i64 %arg3) { +; CHECK-LABEL: test13: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: li a3, 256 +; CHECK-NEXT: .LBB12_1: # %bb2 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: div a0, a0, a1 +; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: bltu a2, a3, .LBB12_1 +; CHECK-NEXT: # %bb.2: # %bb7 +; CHECK-NEXT: sext.w a0, a0 +; CHECK-NEXT: ret +; +; NOREMOVAL-LABEL: test13: +; NOREMOVAL: # %bb.0: # %entry +; NOREMOVAL-NEXT: addi a2, a2, -1 +; NOREMOVAL-NEXT: li a3, 256 +; NOREMOVAL-NEXT: .LBB12_1: # %bb2 +; NOREMOVAL-NEXT: # =>This Inner Loop Header: Depth=1 +; NOREMOVAL-NEXT: div a0, a0, a1 +; NOREMOVAL-NEXT: addi a2, a2, 1 +; NOREMOVAL-NEXT: add a0, a0, a1 +; NOREMOVAL-NEXT: bltu a2, a3, .LBB12_1 +; NOREMOVAL-NEXT: # %bb.2: # %bb7 +; NOREMOVAL-NEXT: sext.w a0, a0 +; NOREMOVAL-NEXT: ret +entry: + br label %bb2 + +bb2: ; preds = %bb2, %entry + %i1 = phi i64 [ %arg1, %entry ], [ %i5, %bb2 ] + %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ] + %i3 = add i64 %i2, 1 + %i4 = sdiv i64 %i1, %arg2 + %i5 = add i64 %i4, %arg2 + %i6 = icmp ugt i64 %i2, 255 + br i1 %i6, label %bb7, label %bb2 + +bb7: ; preds = %bb2 + %i8 = trunc i64 %i5 to i32 + ret i32 %i8 +}