diff --git a/clang/docs/tools/clang-formatted-files.txt b/clang/docs/tools/clang-formatted-files.txt --- a/clang/docs/tools/clang-formatted-files.txt +++ b/clang/docs/tools/clang-formatted-files.txt @@ -6384,7 +6384,6 @@ llvm/lib/Target/AVR/AVRRegisterInfo.cpp llvm/lib/Target/AVR/AVRRegisterInfo.h llvm/lib/Target/AVR/AVRSelectionDAGInfo.h -llvm/lib/Target/AVR/AVRShiftExpand.cpp llvm/lib/Target/AVR/AVRSubtarget.cpp llvm/lib/Target/AVR/AVRSubtarget.h llvm/lib/Target/AVR/AVRTargetMachine.cpp diff --git a/llvm/lib/Target/AVR/AVR.h b/llvm/lib/Target/AVR/AVR.h --- a/llvm/lib/Target/AVR/AVR.h +++ b/llvm/lib/Target/AVR/AVR.h @@ -25,7 +25,6 @@ class FunctionPass; class PassRegistry; -Pass *createAVRShiftExpandPass(); FunctionPass *createAVRISelDag(AVRTargetMachine &TM, CodeGenOpt::Level OptLevel); FunctionPass *createAVRExpandPseudoPass(); @@ -34,7 +33,6 @@ void initializeAVRDAGToDAGISelPass(PassRegistry &); void initializeAVRExpandPseudoPass(PassRegistry &); -void initializeAVRShiftExpandPass(PassRegistry &); /// Contains the AVR backend. namespace AVR { diff --git a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp --- a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp @@ -229,6 +229,11 @@ return false; } +/// Returns whether the given logic operation effectively does not depend on +/// its first argument. +/// +/// For instance, `reg & 0x00` will behave the same way regardless of the +/// register's value. bool AVRExpandPseudo::isLogicRegOpUndef(unsigned Op, unsigned ImmVal) const { // ANDI Rd, 0x00 clears all input bits. if (Op == AVR::ANDIRdK && ImmVal == 0x00) diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -286,11 +286,6 @@ "Expected power-of-2 shift amount"); if (VT.getSizeInBits() == 32) { - if (!isa<ConstantSDNode>(N->getOperand(1))) { - // 32-bit shifts are converted to a loop in IR. - // This should be unreachable. - report_fatal_error("Expected a constant shift amount!"); - } SDVTList ResTys = DAG.getVTList(MVT::i16, MVT::i16); SDValue SrcLo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i16, Op.getOperand(0), @@ -298,25 +293,34 @@ SDValue SrcHi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i16, Op.getOperand(0), DAG.getConstant(1, dl, MVT::i16)); - uint64_t ShiftAmount = - cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); - if (ShiftAmount == 16) { - // Special case these two operations because they appear to be used by the - // generic codegen parts to lower 32-bit numbers. - // TODO: perhaps we can lower shift amounts bigger than 16 to a 16-bit - // shift of a part of the 32-bit value? - switch (Op.getOpcode()) { - case ISD::SHL: { - SDValue Zero = DAG.getConstant(0, dl, MVT::i16); - return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, Zero, SrcLo); - } - case ISD::SRL: { - SDValue Zero = DAG.getConstant(0, dl, MVT::i16); - return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, SrcHi, Zero); - } + SDValue Cnt; + if (isa<ConstantSDNode>(N->getOperand(1))) { + // The amount to shift is known at compile time, so we can create an + // optimized sequence of instructions to shift this value. + uint64_t ShiftAmount = + cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + if (ShiftAmount == 16) { + // Special case these two operations because they appear to be used by + // the generic codegen parts to lower 32-bit numbers. + // TODO: perhaps we can lower shift amounts bigger than 16 to a 16-bit + // shift of a part of the 32-bit value?
+ switch (Op.getOpcode()) { + case ISD::SHL: { + SDValue Zero = DAG.getConstant(0, dl, MVT::i16); + return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, Zero, SrcLo); + } + case ISD::SRL: { + SDValue Zero = DAG.getConstant(0, dl, MVT::i16); + return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, SrcHi, Zero); + } + } } + Cnt = DAG.getTargetConstant(ShiftAmount, dl, MVT::i8); + } else { + // The shift is not known at compile time, so we have to emit this as a + // loop. + Cnt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(1)); } - SDValue Cnt = DAG.getTargetConstant(ShiftAmount, dl, MVT::i8); unsigned Opc; switch (Op.getOpcode()) { default: @@ -1917,20 +1921,20 @@ // shifted. // For more information and background, see this blogpost: // https://aykevl.nl/2021/02/avr-bitshift -static void insertMultibyteShift(MachineInstr &MI, MachineBasicBlock *BB, +static void insertMultibyteShift(MachineBasicBlock::iterator MBBI, + MachineBasicBlock *BB, const DebugLoc &DL, MutableArrayRef<std::pair<Register, int>> Regs, ISD::NodeType Opc, int64_t ShiftAmt) { const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); const AVRSubtarget &STI = BB->getParent()->getSubtarget<AVRSubtarget>(); MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const DebugLoc &dl = MI.getDebugLoc(); const bool ShiftLeft = Opc == ISD::SHL; const bool ArithmeticShift = Opc == ISD::SRA; // Zero a register, for use in later operations. Register ZeroReg = MRI.createVirtualRegister(&AVR::GPR8RegClass); - BuildMI(*BB, MI, dl, TII.get(AVR::COPY), ZeroReg) + BuildMI(*BB, MBBI, DL, TII.get(AVR::COPY), ZeroReg) .addReg(STI.getZeroRegister()); // Do a shift modulo 6 or 7. This is a bit more complicated than most shifts @@ -1949,18 +1953,18 @@ // Shift one to the right, keeping the least significant bit as the carry // bit. - insertMultibyteShift(MI, BB, ShiftRegs, ISD::SRL, 1); + insertMultibyteShift(MBBI, BB, DL, ShiftRegs, ISD::SRL, 1); // Rotate the least significant bit from the carry bit into a new register // (that starts out zero). Register LowByte = MRI.createVirtualRegister(&AVR::GPR8RegClass); - BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), LowByte).addReg(ZeroReg); + BuildMI(*BB, MBBI, DL, TII.get(AVR::RORRd), LowByte).addReg(ZeroReg); // Shift one more to the right if this is a modulo-6 shift. if (ShiftAmt % 8 == 6) { - insertMultibyteShift(MI, BB, ShiftRegs, ISD::SRL, 1); + insertMultibyteShift(MBBI, BB, DL, ShiftRegs, ISD::SRL, 1); Register NewLowByte = MRI.createVirtualRegister(&AVR::GPR8RegClass); - BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), NewLowByte).addReg(LowByte); + BuildMI(*BB, MBBI, DL, TII.get(AVR::RORRd), NewLowByte).addReg(LowByte); LowByte = NewLowByte; } @@ -1988,7 +1992,7 @@ Regs.slice(0, ShiftRegsSize); // Shift one to the left. - insertMultibyteShift(MI, BB, ShiftRegs, ISD::SHL, 1); + insertMultibyteShift(MBBI, BB, DL, ShiftRegs, ISD::SHL, 1); // Sign or zero extend the most significant register into a new register. // The HighByte is the byte that still has one (or two) bits from the @@ -1998,7 +2002,7 @@ Register ExtByte = 0; if (ArithmeticShift) { // Sign-extend bit that was shifted out last. - BuildMI(*BB, MI, dl, TII.get(AVR::SBCRdRr), HighByte) + BuildMI(*BB, MBBI, DL, TII.get(AVR::SBCRdRr), HighByte) .addReg(HighByte, RegState::Undef) .addReg(HighByte, RegState::Undef); ExtByte = HighByte; @@ -2008,17 +2012,17 @@ // Use the zero register for zero extending. ExtByte = ZeroReg; // Rotate most significant bit into a new register (that starts out zero).
- BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), HighByte) + BuildMI(*BB, MBBI, DL, TII.get(AVR::ADCRdRr), HighByte) .addReg(ExtByte) .addReg(ExtByte); } // Shift one more to the left for modulo 6 shifts. if (ShiftAmt % 8 == 6) { - insertMultibyteShift(MI, BB, ShiftRegs, ISD::SHL, 1); + insertMultibyteShift(MBBI, BB, DL, ShiftRegs, ISD::SHL, 1); // Shift the topmost bit into the HighByte. Register NewExt = MRI.createVirtualRegister(&AVR::GPR8RegClass); - BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), NewExt) + BuildMI(*BB, MBBI, DL, TII.get(AVR::ADCRdRr), NewExt) .addReg(HighByte) .addReg(HighByte); HighByte = NewExt; @@ -2063,10 +2067,10 @@ // Sign extend the most significant register into ShrExtendReg. ShrExtendReg = MRI.createVirtualRegister(&AVR::GPR8RegClass); Register Tmp = MRI.createVirtualRegister(&AVR::GPR8RegClass); - BuildMI(*BB, MI, dl, TII.get(AVR::ADDRdRr), Tmp) + BuildMI(*BB, MBBI, DL, TII.get(AVR::ADDRdRr), Tmp) .addReg(Regs[0].first, 0, Regs[0].second) .addReg(Regs[0].first, 0, Regs[0].second); - BuildMI(*BB, MI, dl, TII.get(AVR::SBCRdRr), ShrExtendReg) + BuildMI(*BB, MBBI, DL, TII.get(AVR::SBCRdRr), ShrExtendReg) .addReg(Tmp) .addReg(Tmp); } else { @@ -2112,22 +2116,22 @@ for (size_t I = 0; I < Regs.size(); I++) { size_t Idx = ShiftLeft ? I : Regs.size() - I - 1; Register SwapReg = MRI.createVirtualRegister(&AVR::LD8RegClass); - BuildMI(*BB, MI, dl, TII.get(AVR::SWAPRd), SwapReg) + BuildMI(*BB, MBBI, DL, TII.get(AVR::SWAPRd), SwapReg) .addReg(Regs[Idx].first, 0, Regs[Idx].second); if (I != 0) { Register R = MRI.createVirtualRegister(&AVR::GPR8RegClass); - BuildMI(*BB, MI, dl, TII.get(AVR::EORRdRr), R) + BuildMI(*BB, MBBI, DL, TII.get(AVR::EORRdRr), R) .addReg(Prev) .addReg(SwapReg); Prev = R; } Register AndReg = MRI.createVirtualRegister(&AVR::LD8RegClass); - BuildMI(*BB, MI, dl, TII.get(AVR::ANDIRdK), AndReg) + BuildMI(*BB, MBBI, DL, TII.get(AVR::ANDIRdK), AndReg) .addReg(SwapReg) .addImm(ShiftLeft ? 0xf0 : 0x0f); if (I != 0) { Register R = MRI.createVirtualRegister(&AVR::GPR8RegClass); - BuildMI(*BB, MI, dl, TII.get(AVR::EORRdRr), R) + BuildMI(*BB, MBBI, DL, TII.get(AVR::EORRdRr), R) .addReg(Prev) .addReg(AndReg); size_t PrevIdx = ShiftLeft ? Idx - 1 : Idx + 1; @@ -2148,11 +2152,11 @@ Register In = Regs[I].first; Register InSubreg = Regs[I].second; if (I == (ssize_t)Regs.size() - 1) { // first iteration - BuildMI(*BB, MI, dl, TII.get(AVR::ADDRdRr), Out) + BuildMI(*BB, MBBI, DL, TII.get(AVR::ADDRdRr), Out) .addReg(In, 0, InSubreg) .addReg(In, 0, InSubreg); } else { - BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), Out) + BuildMI(*BB, MBBI, DL, TII.get(AVR::ADCRdRr), Out) .addReg(In, 0, InSubreg) .addReg(In, 0, InSubreg); } @@ -2168,9 +2172,10 @@ Register InSubreg = Regs[I].second; if (I == 0) { unsigned Opc = ArithmeticShift ? AVR::ASRRd : AVR::LSRRd; - BuildMI(*BB, MI, dl, TII.get(Opc), Out).addReg(In, 0, InSubreg); + BuildMI(*BB, MBBI, DL, TII.get(Opc), Out).addReg(In, 0, InSubreg); } else { - BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), Out).addReg(In, 0, InSubreg); + BuildMI(*BB, MBBI, DL, TII.get(AVR::RORRd), Out) + .addReg(In, 0, InSubreg); } Regs[I] = std::pair(Out, 0); } @@ -2182,16 +2187,94 @@ } } +// Do a multibyte shift by shifting one bit at a time in a loop. It works very +// similarly to insertMultibyteShift in that it modifies the Regs array in-place +// (the output registers are stored in this array on return).
+static MachineBasicBlock *insertMultibyteShiftLoop( + MachineInstr &MI, MachineBasicBlock *BB, Register ShiftNum, + MutableArrayRef<std::pair<Register, int>> Regs, ISD::NodeType Opc) { + const DebugLoc &DL = MI.getDebugLoc(); + MachineFunction *MF = BB->getParent(); + const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + + MachineBasicBlock *EntryBB = BB; + MachineBasicBlock *CheckBB = MF->CreateMachineBasicBlock(BB->getBasicBlock()); + MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(BB->getBasicBlock()); + + MF->push_back(CheckBB); + MF->push_back(LoopBB); + MachineBasicBlock *ExitBB = EntryBB->splitAt(MI, false); + + CheckBB->moveAfter(EntryBB); + LoopBB->moveAfter(CheckBB); + ExitBB->moveAfter(LoopBB); + + EntryBB->addSuccessor(CheckBB); + LoopBB->addSuccessor(CheckBB); + CheckBB->addSuccessor(LoopBB); + CheckBB->addSuccessor(ExitBB); + EntryBB->removeSuccessor(ExitBB); + + // Create virtual registers for the value phi nodes. + SmallVector<Register, 4> PhiRegs; + SmallVector<std::pair<Register, int>, 4> PhiRegPairs; + + for (size_t I = 0; I < Regs.size(); I++) { + Register Reg = MRI.createVirtualRegister(&AVR::GPR8RegClass); + PhiRegs.push_back(Reg); + PhiRegPairs.push_back(std::pair(Reg, 0)); + } + + // Shift the registers by one. + // + // Note that we build the blocks in a somewhat reversed order (in the final + // block layout, LoopBB comes *after* CheckBB), because in order to build + // CheckBB we need to know the PHI nodes from LoopBB. + insertMultibyteShift(LoopBB->end(), LoopBB, DL, PhiRegPairs, Opc, 1); + + // Jump back to the check at the top of the loop. + BuildMI(LoopBB, DL, TII.get(AVR::RJMPk)).addMBB(CheckBB); + + // Create PHI nodes for the value that is shifted. + for (size_t I = 0; I < Regs.size(); I++) { + auto Pair = Regs[I]; + + BuildMI(CheckBB, DL, TII.get(AVR::PHI), PhiRegs[I]) + .addReg(Pair.first, 0, Pair.second) + .addMBB(EntryBB) + .addReg(PhiRegPairs[I].first, 0, PhiRegPairs[I].second) + .addMBB(LoopBB); + + Regs[I] = std::pair(PhiRegs[I], 0); + } + + // Create a PHI node for the loop counter. + Register CntPhi = MRI.createVirtualRegister(&AVR::GPR8RegClass); + Register CntDec = MRI.createVirtualRegister(&AVR::GPR8RegClass); + + BuildMI(CheckBB, DL, TII.get(AVR::PHI), CntPhi) + .addReg(ShiftNum) + .addMBB(EntryBB) + .addReg(CntDec) + .addMBB(LoopBB); + + // Decrement the counter; if we're done, jump to the exit block, and + // otherwise fall through to the loop body (LoopBB). + BuildMI(CheckBB, DL, TII.get(AVR::DECRd), CntDec).addReg(CntPhi); + BuildMI(CheckBB, DL, TII.get(AVR::BRMIk)).addMBB(ExitBB); + + return ExitBB; +} + // Do a wide (32-bit) shift. MachineBasicBlock * AVRTargetLowering::insertWideShift(MachineInstr &MI, MachineBasicBlock *BB) const { const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); - const DebugLoc &dl = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock::iterator MBBI(&MI); - // How much to shift to the right (meaning: a negative number indicates a left - // shift). - int64_t ShiftAmt = MI.getOperand(4).getImm(); ISD::NodeType Opc; switch (MI.getOpcode()) { case AVR::Lsl32: @@ -2214,7 +2297,19 @@ }; // Do the shift. The registers are modified in-place. - insertMultibyteShift(MI, BB, Registers, Opc, ShiftAmt); + int64_t ShiftAmt = 1; + if (MI.getOperand(4).isImm()) { + // The shift amount is known at compile time. + ShiftAmt = MI.getOperand(4).getImm(); + insertMultibyteShift(MBBI, BB, MI.getDebugLoc(), Registers, Opc, ShiftAmt); + } else { + // The shift amount is not known at compile time. We need to create a loop.
+ Register ShiftNum = MI.getOperand(4).getReg(); + BB = insertMultibyteShiftLoop(MI, BB, ShiftNum, Registers, Opc); + + // Insert REG_SEQUENCE instructions at the beginning of ExitBB. + MBBI = BB->begin(); + } // Combine the 8-bit registers into 16-bit register pairs. // This done either from LSB to MSB or from MSB to LSB, depending on the @@ -2230,24 +2325,28 @@ if (Opc != ISD::SHL && (Opc != ISD::SRA || (ShiftAmt < 16 || ShiftAmt >= 22))) { // Use the resulting registers starting with the least significant byte. - BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(0).getReg()) + BuildMI(*BB, MBBI, DL, TII.get(AVR::REG_SEQUENCE), + MI.getOperand(0).getReg()) .addReg(Registers[3].first, 0, Registers[3].second) .addImm(AVR::sub_lo) .addReg(Registers[2].first, 0, Registers[2].second) .addImm(AVR::sub_hi); - BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(1).getReg()) + BuildMI(*BB, MBBI, DL, TII.get(AVR::REG_SEQUENCE), + MI.getOperand(1).getReg()) .addReg(Registers[1].first, 0, Registers[1].second) .addImm(AVR::sub_lo) .addReg(Registers[0].first, 0, Registers[0].second) .addImm(AVR::sub_hi); } else { // Use the resulting registers starting with the most significant byte. - BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(1).getReg()) + BuildMI(*BB, MBBI, DL, TII.get(AVR::REG_SEQUENCE), + MI.getOperand(1).getReg()) .addReg(Registers[0].first, 0, Registers[0].second) .addImm(AVR::sub_hi) .addReg(Registers[1].first, 0, Registers[1].second) .addImm(AVR::sub_lo); - BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(0).getReg()) + BuildMI(*BB, MBBI, DL, TII.get(AVR::REG_SEQUENCE), + MI.getOperand(0).getReg()) .addReg(Registers[2].first, 0, Registers[2].second) .addImm(AVR::sub_hi) .addReg(Registers[3].first, 0, Registers[3].second) diff --git a/llvm/lib/Target/AVR/AVRShiftExpand.cpp b/llvm/lib/Target/AVR/AVRShiftExpand.cpp deleted file mode 100644 --- a/llvm/lib/Target/AVR/AVRShiftExpand.cpp +++ /dev/null @@ -1,147 +0,0 @@ -//===- AVRShift.cpp - Shift Expansion Pass --------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Expand 32-bit shift instructions (shl, lshr, ashr) to inline loops, just -/// like avr-gcc. This must be done in IR because otherwise the type legalizer -/// will turn 32-bit shifts into (non-existing) library calls such as __ashlsi3. 
-// -//===----------------------------------------------------------------------===// - -#include "AVR.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstIterator.h" - -using namespace llvm; - -namespace { - -class AVRShiftExpand : public FunctionPass { -public: - static char ID; - - AVRShiftExpand() : FunctionPass(ID) {} - - bool runOnFunction(Function &F) override; - - StringRef getPassName() const override { return "AVR Shift Expansion"; } - -private: - void expand(BinaryOperator *BI); -}; - -} // end of anonymous namespace - -char AVRShiftExpand::ID = 0; - -INITIALIZE_PASS(AVRShiftExpand, "avr-shift-expand", "AVR Shift Expansion", - false, false) - -Pass *llvm::createAVRShiftExpandPass() { return new AVRShiftExpand(); } - -bool AVRShiftExpand::runOnFunction(Function &F) { - SmallVector ShiftInsts; - auto &Ctx = F.getContext(); - for (Instruction &I : instructions(F)) { - if (!I.isShift()) - // Only expand shift instructions (shl, lshr, ashr). - continue; - if (I.getType() != Type::getInt32Ty(Ctx)) - // Only expand plain i32 types. - continue; - if (isa(I.getOperand(1))) - // Only expand when the shift amount is not known. - // Known shift amounts are (currently) better expanded inline. - continue; - ShiftInsts.push_back(cast(&I)); - } - - // The expanding itself needs to be done separately as expand() will remove - // these instructions. Removing instructions while iterating over a basic - // block is not a great idea. - for (auto *I : ShiftInsts) { - expand(I); - } - - // Return whether this function expanded any shift instructions. - return ShiftInsts.size() > 0; -} - -void AVRShiftExpand::expand(BinaryOperator *BI) { - auto &Ctx = BI->getContext(); - IRBuilder<> Builder(BI); - Type *Int32Ty = Type::getInt32Ty(Ctx); - Type *Int8Ty = Type::getInt8Ty(Ctx); - Value *Int8Zero = ConstantInt::get(Int8Ty, 0); - - // Split the current basic block at the point of the existing shift - // instruction and insert a new basic block for the loop. - BasicBlock *BB = BI->getParent(); - Function *F = BB->getParent(); - BasicBlock *EndBB = BB->splitBasicBlock(BI, "shift.done"); - BasicBlock *LoopBB = BasicBlock::Create(Ctx, "shift.loop", F, EndBB); - - // Truncate the shift amount to i8, which is trivially lowered to a single - // AVR register. - Builder.SetInsertPoint(&BB->back()); - Value *ShiftAmount = Builder.CreateTrunc(BI->getOperand(1), Int8Ty); - - // Replace the unconditional branch that splitBasicBlock created with a - // conditional branch. - Value *Cmp1 = Builder.CreateICmpEQ(ShiftAmount, Int8Zero); - Builder.CreateCondBr(Cmp1, EndBB, LoopBB); - BB->back().eraseFromParent(); - - // Create the loop body starting with PHI nodes. - Builder.SetInsertPoint(LoopBB); - PHINode *ShiftAmountPHI = Builder.CreatePHI(Int8Ty, 2); - ShiftAmountPHI->addIncoming(ShiftAmount, BB); - PHINode *ValuePHI = Builder.CreatePHI(Int32Ty, 2); - ValuePHI->addIncoming(BI->getOperand(0), BB); - - // Subtract the shift amount by one, as we're shifting one this loop - // iteration. - Value *ShiftAmountSub = - Builder.CreateSub(ShiftAmountPHI, ConstantInt::get(Int8Ty, 1)); - ShiftAmountPHI->addIncoming(ShiftAmountSub, LoopBB); - - // Emit the actual shift instruction. The difference is that this shift - // instruction has a constant shift amount, which can be emitted inline - // without a library call. 
- Value *ValueShifted; - switch (BI->getOpcode()) { - case Instruction::Shl: - ValueShifted = Builder.CreateShl(ValuePHI, ConstantInt::get(Int32Ty, 1)); - break; - case Instruction::LShr: - ValueShifted = Builder.CreateLShr(ValuePHI, ConstantInt::get(Int32Ty, 1)); - break; - case Instruction::AShr: - ValueShifted = Builder.CreateAShr(ValuePHI, ConstantInt::get(Int32Ty, 1)); - break; - default: - llvm_unreachable("asked to expand an instruction that is not a shift"); - } - ValuePHI->addIncoming(ValueShifted, LoopBB); - - // Branch to either the loop again (if there is more to shift) or to the - // basic block after the loop (if all bits are shifted). - Value *Cmp2 = Builder.CreateICmpEQ(ShiftAmountSub, Int8Zero); - Builder.CreateCondBr(Cmp2, EndBB, LoopBB); - - // Collect the resulting value. This is necessary in the IR but won't produce - // any actual instructions. - Builder.SetInsertPoint(BI); - PHINode *Result = Builder.CreatePHI(Int32Ty, 2); - Result->addIncoming(BI->getOperand(0), BB); - Result->addIncoming(ValueShifted, LoopBB); - - // Replace the original shift instruction. - BI->replaceAllUsesWith(Result); - BI->eraseFromParent(); -} diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/llvm/lib/Target/AVR/AVRTargetMachine.cpp --- a/llvm/lib/Target/AVR/AVRTargetMachine.cpp +++ b/llvm/lib/Target/AVR/AVRTargetMachine.cpp @@ -68,7 +68,6 @@ return getTM<AVRTargetMachine>(); } - void addIRPasses() override; bool addInstSelector() override; void addPreSched2() override; void addPreEmitPass() override; @@ -79,22 +78,12 @@ return new AVRPassConfig(*this, PM); } -void AVRPassConfig::addIRPasses() { - // Expand instructions like - // %result = shl i32 %n, %amount - // to a loop so that library calls are avoided. - addPass(createAVRShiftExpandPass()); - - TargetPassConfig::addIRPasses(); -} - extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTarget() { // Register the target. RegisterTargetMachine<AVRTargetMachine> X(getTheAVRTarget()); auto &PR = *PassRegistry::getPassRegistry(); initializeAVRExpandPseudoPass(PR); - initializeAVRShiftExpandPass(PR); initializeAVRDAGToDAGISelPass(PR); } diff --git a/llvm/lib/Target/AVR/CMakeLists.txt b/llvm/lib/Target/AVR/CMakeLists.txt --- a/llvm/lib/Target/AVR/CMakeLists.txt +++ b/llvm/lib/Target/AVR/CMakeLists.txt @@ -23,7 +23,6 @@ AVRISelLowering.cpp AVRMCInstLower.cpp AVRRegisterInfo.cpp - AVRShiftExpand.cpp AVRSubtarget.cpp AVRTargetMachine.cpp AVRTargetObjectFile.cpp diff --git a/llvm/test/CodeGen/AVR/shift-expand.ll b/llvm/test/CodeGen/AVR/shift-expand.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AVR/shift-expand.ll +++ /dev/null @@ -1,89 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -avr-shift-expand -S %s -o - | FileCheck %s - -; The avr-shift-expand pass expands large shifts with a non-constant shift -; amount to a loop. These loops avoid generating a (non-existing) builtin such -; as __ashlsi3.
- -target datalayout = "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8" -target triple = "avr" - -define i32 @shl(i32 %value, i32 %amount) addrspace(1) { -; CHECK-LABEL: @shl( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[AMOUNT:%.*]] to i8 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[SHIFT_DONE:%.*]], label [[SHIFT_LOOP:%.*]] -; CHECK: shift.loop: -; CHECK-NEXT: [[TMP3:%.*]] = phi i8 [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[SHIFT_LOOP]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[VALUE:%.*]], [[TMP0]] ], [ [[TMP6:%.*]], [[SHIFT_LOOP]] ] -; CHECK-NEXT: [[TMP5]] = sub i8 [[TMP3]], 1 -; CHECK-NEXT: [[TMP6]] = shl i32 [[TMP4]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[TMP7]], label [[SHIFT_DONE]], label [[SHIFT_LOOP]] -; CHECK: shift.done: -; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[VALUE]], [[TMP0]] ], [ [[TMP6]], [[SHIFT_LOOP]] ] -; CHECK-NEXT: ret i32 [[TMP8]] -; - %result = shl i32 %value, %amount - ret i32 %result -} - -define i32 @lshr(i32 %value, i32 %amount) addrspace(1) { -; CHECK-LABEL: @lshr( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[AMOUNT:%.*]] to i8 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[SHIFT_DONE:%.*]], label [[SHIFT_LOOP:%.*]] -; CHECK: shift.loop: -; CHECK-NEXT: [[TMP3:%.*]] = phi i8 [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[SHIFT_LOOP]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[VALUE:%.*]], [[TMP0]] ], [ [[TMP6:%.*]], [[SHIFT_LOOP]] ] -; CHECK-NEXT: [[TMP5]] = sub i8 [[TMP3]], 1 -; CHECK-NEXT: [[TMP6]] = lshr i32 [[TMP4]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[TMP7]], label [[SHIFT_DONE]], label [[SHIFT_LOOP]] -; CHECK: shift.done: -; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[VALUE]], [[TMP0]] ], [ [[TMP6]], [[SHIFT_LOOP]] ] -; CHECK-NEXT: ret i32 [[TMP8]] -; - %result = lshr i32 %value, %amount - ret i32 %result -} - -define i32 @ashr(i32 %0, i32 %1) addrspace(1) { -; CHECK-LABEL: @ashr( -; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP1:%.*]] to i8 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i8 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[SHIFT_DONE:%.*]], label [[SHIFT_LOOP:%.*]] -; CHECK: shift.loop: -; CHECK-NEXT: [[TMP5:%.*]] = phi i8 [ [[TMP3]], [[TMP2:%.*]] ], [ [[TMP7:%.*]], [[SHIFT_LOOP]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[TMP0:%.*]], [[TMP2]] ], [ [[TMP8:%.*]], [[SHIFT_LOOP]] ] -; CHECK-NEXT: [[TMP7]] = sub i8 [[TMP5]], 1 -; CHECK-NEXT: [[TMP8]] = ashr i32 [[TMP6]], 1 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i8 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[TMP9]], label [[SHIFT_DONE]], label [[SHIFT_LOOP]] -; CHECK: shift.done: -; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP0]], [[TMP2]] ], [ [[TMP8]], [[SHIFT_LOOP]] ] -; CHECK-NEXT: ret i32 [[TMP10]] -; - %3 = ashr i32 %0, %1 - ret i32 %3 -} - -; This function is not modified because it is not an i32. -define i40 @shl40(i40 %value, i40 %amount) addrspace(1) { -; CHECK-LABEL: @shl40( -; CHECK-NEXT: [[RESULT:%.*]] = shl i40 [[VALUE:%.*]], [[AMOUNT:%.*]] -; CHECK-NEXT: ret i40 [[RESULT]] -; - %result = shl i40 %value, %amount - ret i40 %result -} - -; This function isn't either, although perhaps it should. 
-define i24 @shl24(i24 %value, i24 %amount) addrspace(1) { -; CHECK-LABEL: @shl24( -; CHECK-NEXT: [[RESULT:%.*]] = shl i24 [[VALUE:%.*]], [[AMOUNT:%.*]] -; CHECK-NEXT: ret i24 [[RESULT]] -; - %result = shl i24 %value, %amount - ret i24 %result -} diff --git a/llvm/test/CodeGen/AVR/shift-loop.ll b/llvm/test/CodeGen/AVR/shift-loop.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AVR/shift-loop.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc < %s -mtriple=avr -verify-machineinstrs -stop-after=dead-mi-elimination | FileCheck %s + +; This test shows the machine IR that is generated when lowering a shift +; operation to a loop. + +define i32 @shl_i32_n(i32 %a, i32 %b) #0 { + ; CHECK-LABEL: name: shl_i32_n + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $r23r22, $r25r24, $r19r18 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:dregs = COPY $r19r18 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:dregs = COPY $r25r24 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:dregs = COPY $r23r22 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr8 = COPY [[COPY]].sub_lo + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr8 = PHI [[COPY1]].sub_hi, %bb.0, %15, %bb.2 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:gpr8 = PHI [[COPY1]].sub_lo, %bb.0, %14, %bb.2 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:gpr8 = PHI [[COPY2]].sub_hi, %bb.0, %13, %bb.2 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:gpr8 = PHI [[COPY2]].sub_lo, %bb.0, %12, %bb.2 + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:gpr8 = PHI [[COPY3]], %bb.0, %17, %bb.2 + ; CHECK-NEXT: [[DECRd:%[0-9]+]]:gpr8 = DECRd [[PHI4]], implicit-def $sreg + ; CHECK-NEXT: BRMIk %bb.3, implicit $sreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[ADDRdRr:%[0-9]+]]:gpr8 = ADDRdRr [[PHI3]], [[PHI3]], implicit-def $sreg + ; CHECK-NEXT: [[ADCRdRr:%[0-9]+]]:gpr8 = ADCRdRr [[PHI2]], [[PHI2]], implicit-def $sreg, implicit $sreg + ; CHECK-NEXT: [[ADCRdRr1:%[0-9]+]]:gpr8 = ADCRdRr [[PHI1]], [[PHI1]], implicit-def $sreg, implicit $sreg + ; CHECK-NEXT: [[ADCRdRr2:%[0-9]+]]:gpr8 = ADCRdRr [[PHI]], [[PHI]], implicit-def $sreg, implicit $sreg + ; CHECK-NEXT: RJMPk %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3 (%ir-block.0): + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:dregs = REG_SEQUENCE [[PHI]], %subreg.sub_hi, [[PHI1]], %subreg.sub_lo + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:dregs = REG_SEQUENCE [[PHI2]], %subreg.sub_hi, [[PHI3]], %subreg.sub_lo + ; CHECK-NEXT: $r23r22 = COPY [[REG_SEQUENCE1]] + ; CHECK-NEXT: $r25r24 = COPY [[REG_SEQUENCE]] + ; CHECK-NEXT: RET implicit $r23r22, implicit $r25r24, implicit $r1 + %res = shl i32 %a, %b + ret i32 %res +} diff --git a/llvm/test/CodeGen/AVR/shift32.ll b/llvm/test/CodeGen/AVR/shift32.ll --- a/llvm/test/CodeGen/AVR/shift32.ll +++ b/llvm/test/CodeGen/AVR/shift32.ll @@ -1,6 +1,67 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=avr -mattr=movw -verify-machineinstrs | FileCheck %s +; Shift by a number unknown at compile time. +; The 'optsize' attribute is set to avoid duplicating part of the loop. +; TODO: it is more efficient to jump at the start and do the check where the +; 'rjmp' is now. The branch relaxation pass puts them in this non-optimal order.
+ +define i32 @shl_i32_n(i32 %a, i32 %b) #0 { +; CHECK-LABEL: shl_i32_n: +; CHECK: ; %bb.0: +; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: dec r18 +; CHECK-NEXT: brmi .LBB0_3 +; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: lsl r22 +; CHECK-NEXT: rol r23 +; CHECK-NEXT: rol r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: rjmp .LBB0_1 +; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: ret + %res = shl i32 %a, %b + ret i32 %res +} + +define i32 @lshr_i32_n(i32 %a, i32 %b) #0 { +; CHECK-LABEL: lshr_i32_n: +; CHECK: ; %bb.0: +; CHECK-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: dec r18 +; CHECK-NEXT: brmi .LBB1_3 +; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: lsr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: rjmp .LBB1_1 +; CHECK-NEXT: .LBB1_3: +; CHECK-NEXT: ret + %res = lshr i32 %a, %b + ret i32 %res +} + +define i32 @ashr_i32_n(i32 %a, i32 %b) #0 { +; CHECK-LABEL: ashr_i32_n: +; CHECK: ; %bb.0: +; CHECK-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: dec r18 +; CHECK-NEXT: brmi .LBB2_3 +; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB2_1 Depth=1 +; CHECK-NEXT: asr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: rjmp .LBB2_1 +; CHECK-NEXT: .LBB2_3: +; CHECK-NEXT: ret + %res = ashr i32 %a, %b + ret i32 %res +} + +; Shift by a constant known at compile time. + define i32 @shl_i32_1(i32 %a) { ; CHECK-LABEL: shl_i32_1: ; CHECK: ; %bb.0: @@ -575,3 +636,5 @@ %res = ashr i32 %a, 31 ret i32 %res } + +attributes #0 = { optsize } diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AVR/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AVR/BUILD.gn --- a/llvm/utils/gn/secondary/llvm/lib/Target/AVR/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AVR/BUILD.gn @@ -37,7 +37,6 @@ "AVRInstrInfo.cpp", "AVRMCInstLower.cpp", "AVRRegisterInfo.cpp", - "AVRShiftExpand.cpp", "AVRSubtarget.cpp", "AVRTargetMachine.cpp", "AVRTargetObjectFile.cpp",
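Illustrative note (not part of the diff above): the small sketch below, with made-up function and value names, shows the two lowering paths this change distinguishes. A shift whose amount is a compile-time constant is still expanded to the inline sequence in LowerShift/insertMultibyteShift, while a shift by a run-time amount no longer relies on the removed avr-shift-expand IR pass and is instead lowered to the CheckBB/LoopBB machine-level loop built by insertMultibyteShiftLoop. Both paths can be inspected with the same llc invocation the updated tests use (llc -mtriple=avr -mattr=movw -verify-machineinstrs).

; Hypothetical input, mirroring the new tests; names are illustrative only.
define i32 @const_amount(i32 %v) {
  %r = lshr i32 %v, 16        ; constant amount: inline (non-loop) expansion
  ret i32 %r
}

define i32 @runtime_amount(i32 %v, i32 %n) {
  %r = lshr i32 %v, %n        ; run-time amount: lowered to the shift loop
  ret i32 %r
}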