diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -48,10 +48,99 @@
 namespace {
 
-class AMDGPULowerModuleLDS : public ModulePass {
+class LowerModuleLDSImpl {
+  Module &M;
+  LLVMContext &Ctx;
+  const DataLayout &DL;
+
+  // Sort by alignment, descending, to minimise padding. On ties, sort by size,
+  // descending, then by name, lexicographical.
+  void sortLocalVars(std::vector<GlobalVariable *> &FoundLocalVars) {
+    llvm::stable_sort(
+        FoundLocalVars,
+        [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool {
+          Align ALHS = AMDGPU::getAlign(DL, LHS);
+          Align ARHS = AMDGPU::getAlign(DL, RHS);
+          if (ALHS != ARHS) {
+            return ALHS > ARHS;
+          }
+
+          TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType());
+          TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType());
+          if (SLHS != SRHS) {
+            return SLHS > SRHS;
+          }
+
+          // By variable name on tie for predictable order in test cases.
+          return LHS->getName() < RHS->getName();
+        });
+  }
+
+  std::vector<GlobalVariable *> insertPaddingVarsWithinSortedLocalVarsList(
+      std::vector<GlobalVariable *> &FoundLocalVars) {
+    std::vector<GlobalVariable *> LocalVars;
+    LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
+
+    // This usually won't need to insert any padding, perhaps avoid the alloc
+    uint64_t CurrentOffset = 0;
+    for (size_t I = 0; I < FoundLocalVars.size(); I++) {
+      GlobalVariable *FGV = FoundLocalVars[I];
+      Align DataAlign = AMDGPU::getAlign(DL, FGV);
+
+      uint64_t DataAlignV = DataAlign.value();
+      if (uint64_t Rem = CurrentOffset % DataAlignV) {
+        uint64_t Padding = DataAlignV - Rem;
+
+        // Append an array of padding bytes to meet alignment requested
+        // Note (o + (a - (o % a)) ) % a == 0
+        //      (offset + Padding ) % align == 0
+
+        Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
+        LocalVars.push_back(new GlobalVariable(
+            M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
+            "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
+            false));
+        CurrentOffset += Padding;
+      }
+
+      LocalVars.push_back(FGV);
+      CurrentOffset += DL.getTypeAllocSize(FGV->getValueType());
+    }
+
+    return LocalVars;
+  }
+
+  std::pair<StructType *, GlobalVariable *>
+  createNewStructTypeAndItsInstance(std::vector<GlobalVariable *> &LocalVars) {
+    std::vector<Type *> LocalVarTypes;
+    LocalVarTypes.reserve(LocalVars.size());
+    std::transform(
+        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
+        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
+
+    StructType *LDSTy = StructType::create(
+        Ctx, LocalVarTypes, llvm::StringRef("llvm.amdgcn.module.lds.t"));
+
+    Align MaxAlign =
+        AMDGPU::getAlign(DL, LocalVars[0]); // was sorted on alignment
+
+    GlobalVariable *SGV = new GlobalVariable(
+        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
+        "llvm.amdgcn.module.lds", nullptr, GlobalValue::NotThreadLocal,
+        AMDGPUAS::LOCAL_ADDRESS, false);
-  static void removeFromUsedList(Module &M, StringRef Name,
-                                 SmallPtrSetImpl<Constant *> &ToRemove) {
+    SGV->setAlignment(MaxAlign);
+
+    appendToCompilerUsed(
+        M, {static_cast<GlobalValue *>(
+               ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+                   cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+
+    return std::make_pair(LDSTy, SGV);
+  }
+
+  void removeFromUsedList(Module &M, StringRef Name,
+                          SmallPtrSetImpl<Constant *> &ToRemove) {
     GlobalVariable *GV = M.getGlobalVariable(Name);
     if (!GV || ToRemove.empty()) {
       return;
@@ -83,9 +172,8 @@
     }
   }
 
-  static void
-  removeFromUsedLists(Module &M,
-                      const std::vector<GlobalVariable *> &LocalVars) {
+  void removeFromUsedLists(Module &M,
+                           const std::vector<GlobalVariable *> &LocalVars) {
     SmallPtrSet<Constant *, 32> LocalVarsSet;
     for (size_t I = 0; I < LocalVars.size(); I++) {
       if (Constant *C = dyn_cast<Constant>(LocalVars[I]->stripPointerCasts())) {
@@ -96,8 +184,26 @@
     removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet);
   }
 
-  static void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
-                               GlobalVariable *SGV) {
+  void replaceUsesOfLocalVars(std::vector<GlobalVariable *> &LocalVars,
+                              StructType *LDSTy, GlobalVariable *SGV) {
+    // Replace uses of ith variable with a constantexpr to the ith field of the
+    // instance that will be allocated by AMDGPUMachineFunction
+    Constant *InstanceAddress = Constant::getIntegerValue(
+        PointerType::get(LDSTy, AMDGPUAS::LOCAL_ADDRESS), APInt(32, 0));
+
+    Type *I32 = Type::getInt32Ty(Ctx);
+
+    for (size_t I = 0; I < LocalVars.size(); I++) {
+      GlobalVariable *GV = LocalVars[I];
+      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
+      GV->replaceAllUsesWith(
+          ConstantExpr::getGetElementPtr(LDSTy, InstanceAddress, GEPIdx));
+      GV->eraseFromParent();
+    }
+  }
+
+  void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
+                        GlobalVariable *SGV) {
     // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
     // that might call a function which accesses a field within it. This is
     // presently approximated to 'all kernels' if there are any such functions
@@ -129,101 +235,44 @@
                        "");
   }
 
-public:
-  static char ID;
-
-  AMDGPULowerModuleLDS() : ModulePass(ID) {
-    initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
+  void markUsedByKernel(GlobalVariable *SGV) {
+    IRBuilder<> Builder(Ctx);
+    SmallPtrSet<Function *, 32> Kernels;
+    for (auto &I : M.functions()) {
+      Function *Func = &I;
+      if (AMDGPU::isKernelCC(Func) && !Kernels.contains(Func)) {
+        markUsedByKernel(Builder, Func, SGV);
+        Kernels.insert(Func);
+      }
+    }
   }
 
-  bool runOnModule(Module &M) override {
-    LLVMContext &Ctx = M.getContext();
-    const DataLayout &DL = M.getDataLayout();
-    SmallPtrSet<GlobalValue *, 32> UsedList = AMDGPU::getUsedList(M);
+public:
+  explicit LowerModuleLDSImpl(Module &M)
+      : M(M), Ctx(M.getContext()), DL(M.getDataLayout()) {}
 
-    // Find variables to move into new struct instance
+  bool lower() {
+    // Find variables to move into new struct instance.
     std::vector<GlobalVariable *> FoundLocalVars =
-        AMDGPU::findVariablesToLower(M, UsedList);
+        AMDGPU::findVariablesToLower(M);
 
     if (FoundLocalVars.empty()) {
       // No variables to rewrite, no changes made.
       return false;
     }
 
-    // Sort by alignment, descending, to minimise padding.
-    // On ties, sort by size, descending, then by name, lexicographical.
-    llvm::stable_sort(
-        FoundLocalVars,
-        [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool {
-          Align ALHS = AMDGPU::getAlign(DL, LHS);
-          Align ARHS = AMDGPU::getAlign(DL, RHS);
-          if (ALHS != ARHS) {
-            return ALHS > ARHS;
-          }
-
-          TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType());
-          TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType());
-          if (SLHS != SRHS) {
-            return SLHS > SRHS;
-          }
-
-          // By variable name on tie for predictable order in test cases.
-          return LHS->getName() < RHS->getName();
-        });
-
-    std::vector<GlobalVariable *> LocalVars;
-    LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
-    {
-      // This usually won't need to insert any padding, perhaps avoid the alloc
-      uint64_t CurrentOffset = 0;
-      for (size_t I = 0; I < FoundLocalVars.size(); I++) {
-        GlobalVariable *FGV = FoundLocalVars[I];
-        Align DataAlign = AMDGPU::getAlign(DL, FGV);
-
-        uint64_t DataAlignV = DataAlign.value();
-        if (uint64_t Rem = CurrentOffset % DataAlignV) {
-          uint64_t Padding = DataAlignV - Rem;
-
-          // Append an array of padding bytes to meet alignment requested
-          // Note (o + (a - (o % a)) ) % a == 0
-          //      (offset + Padding ) % align == 0
-
-          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
-          LocalVars.push_back(new GlobalVariable(
-              M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
-              "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
-              false));
-          CurrentOffset += Padding;
-        }
-
-        LocalVars.push_back(FGV);
-        CurrentOffset += DL.getTypeAllocSize(FGV->getValueType());
-      }
-    }
-
-    std::vector<Type *> LocalVarTypes;
-    LocalVarTypes.reserve(LocalVars.size());
-    std::transform(
-        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
-        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
-
-    StructType *LDSTy = StructType::create(
-        Ctx, LocalVarTypes, llvm::StringRef("llvm.amdgcn.module.lds.t"));
+    // Sort by alignment, descending, to minimise padding. On ties, sort by
+    // size, descending, then by name, lexicographical.
+    sortLocalVars(FoundLocalVars);
 
-    Align MaxAlign =
-        AMDGPU::getAlign(DL, LocalVars[0]); // was sorted on alignment
-    Constant *InstanceAddress = Constant::getIntegerValue(
-        PointerType::get(LDSTy, AMDGPUAS::LOCAL_ADDRESS), APInt(32, 0));
+    // Insert needed padding variables.
+    std::vector<GlobalVariable *> LocalVars =
+        insertPaddingVarsWithinSortedLocalVarsList(FoundLocalVars);
 
-    GlobalVariable *SGV = new GlobalVariable(
-        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
-        "llvm.amdgcn.module.lds", nullptr, GlobalValue::NotThreadLocal,
-        AMDGPUAS::LOCAL_ADDRESS, false);
-    SGV->setAlignment(MaxAlign);
-    appendToCompilerUsed(
-        M, {static_cast<GlobalValue *>(
-               ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-                   cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+    // Construct new struct type and its global instance.
+    StructType *LDSTy;
+    GlobalVariable *SGV;
+    std::tie(LDSTy, SGV) = createNewStructTypeAndItsInstance(LocalVars);
 
     // The verifier rejects used lists containing an inttoptr of a constant
     // so remove the variables from these lists before replaceAllUsesWith
@@ -231,35 +280,31 @@
 
     // Replace uses of ith variable with a constantexpr to the ith field of the
     // instance that will be allocated by AMDGPUMachineFunction
-    Type *I32 = Type::getInt32Ty(Ctx);
-    for (size_t I = 0; I < LocalVars.size(); I++) {
-      GlobalVariable *GV = LocalVars[I];
-      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
-      GV->replaceAllUsesWith(
-          ConstantExpr::getGetElementPtr(LDSTy, InstanceAddress, GEPIdx));
-      GV->eraseFromParent();
-    }
+    replaceUsesOfLocalVars(LocalVars, LDSTy, SGV);
 
     // Mark kernels with asm that reads the address of the allocated structure
     // This is not necessary for lowering. This lets other passes, specifically
     // PromoteAlloca, accurately calculate how much LDS will be used by the
    // kernel after lowering.
-    {
-      IRBuilder<> Builder(Ctx);
-      SmallPtrSet<Function *, 32> Kernels;
-      for (auto &I : M.functions()) {
-        Function *Func = &I;
-        if (AMDGPU::isKernelCC(Func) && !Kernels.contains(Func)) {
-          markUsedByKernel(Builder, Func, SGV);
-          Kernels.insert(Func);
-        }
-      }
-    }
+    markUsedByKernel(SGV);
+
     return true;
   }
 };
 
+class AMDGPULowerModuleLDS : public ModulePass {
+public:
+  static char ID;
+
+  AMDGPULowerModuleLDS() : ModulePass(ID) {
+    initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override;
+};
+
 } // namespace
+
 char AMDGPULowerModuleLDS::ID = 0;
 
 char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;
 
@@ -268,12 +313,17 @@
                 "Lower uses of LDS variables from non-kernel functions", false,
                 false)
 
+bool AMDGPULowerModuleLDS::runOnModule(Module &M) {
+  LowerModuleLDSImpl Lowerer(M);
+  return Lowerer.lower();
+}
+
 ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
   return new AMDGPULowerModuleLDS();
 }
 
 PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
                                                 ModuleAnalysisManager &) {
-  return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
-                                               : PreservedAnalyses::all();
+  LowerModuleLDSImpl Lowerer(M);
+  return Lowerer.lower() ? PreservedAnalyses::none() : PreservedAnalyses::all();
 }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -26,8 +26,7 @@
 bool userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
                           User *InitialUser);
 
-std::vector<GlobalVariable *>
-findVariablesToLower(Module &M, const SmallPtrSetImpl<GlobalValue *> &UsedList);
+std::vector<GlobalVariable *> findVariablesToLower(Module &M);
 
 SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -72,9 +72,7 @@
   return false;
 }
 
-std::vector<GlobalVariable *>
-findVariablesToLower(Module &M,
-                     const SmallPtrSetImpl<GlobalValue *> &UsedList) {
+std::vector<GlobalVariable *> findVariablesToLower(Module &M) {
   std::vector<GlobalVariable *> LocalVars;
   for (auto &GV : M.globals()) {
     if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
@@ -99,7 +97,7 @@
       continue;
    }
    if (std::none_of(GV.user_begin(), GV.user_end(), [&](User *U) {
-          return userRequiresLowering(UsedList, U);
+          return userRequiresLowering(AMDGPU::getUsedList(M), U);
        })) {
      continue;
    }