diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -68,6 +68,7 @@
 FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *);
 FunctionPass *createAMDGPUUseNativeCallsPass();
 FunctionPass *createAMDGPUCodeGenPreparePass();
+FunctionPass *createAMDGPULateCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
 FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
 ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
@@ -223,6 +224,9 @@
 void initializeAMDGPUCodeGenPreparePass(PassRegistry&);
 extern char &AMDGPUCodeGenPrepareID;
 
+void initializeAMDGPULateCodeGenPreparePass(PassRegistry &);
+extern char &AMDGPULateCodeGenPrepareID;
+
 void initializeSIAnnotateControlFlowPass(PassRegistry&);
 extern char &SIAnnotateControlFlowPassID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -0,0 +1,198 @@
+//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass does miscellaneous AMDGPU optimizations on IR *just* before
+/// instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include
+#include
+
+#define DEBUG_TYPE "amdgpu-late-codegenprepare"
+
+using namespace llvm;
+
+// Scalar load widening needs to run after the load-store vectorizer, as that
+// pass does not handle overlapping cases. In addition, this pass enhances the
+// widening to handle cases where scalar sub-dword loads are only naturally
+// aligned but not dword aligned.
+static cl::opt<bool>
+    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
+               cl::desc("Widen sub-dword constant address space loads in "
+                        "AMDGPULateCodeGenPrepare"),
+               cl::ReallyHidden, cl::init(true));
+
+namespace {
+
+class AMDGPULateCodeGenPrepare
+    : public FunctionPass,
+      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
+  Module *Mod = nullptr;
+  const DataLayout *DL = nullptr;
+
+  AssumptionCache *AC = nullptr;
+  LegacyDivergenceAnalysis *DA = nullptr;
+
+public:
+  static char ID;
+
+  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+    return "AMDGPU IR late optimizations";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<LegacyDivergenceAnalysis>();
+    AU.setPreservesAll();
+  }
+
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+
+  bool visitInstruction(Instruction &) { return false; }
+
+  // Check if the specified value is at least DWORD aligned.
+  bool isDWORDAligned(const Value *V) const {
+    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
+    return Known.countMinTrailingZeros() >= 2;
+  }
+
+  bool canWidenScalarExtLoad(LoadInst &LI) const;
+  bool visitLoadInst(LoadInst &LI);
+};
+
+} // end anonymous namespace
+
+bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
+  Mod = &M;
+  DL = &Mod->getDataLayout();
+  return false;
+}
+
+bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  DA = &getAnalysis<LegacyDivergenceAnalysis>();
+
+  bool Changed = false;
+  for (auto &BB : F)
+    for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
+      Instruction *I = &*BI++;
+      Changed |= visit(*I);
+    }
+
+  return Changed;
+}
+
+bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
+  unsigned AS = LI.getPointerAddressSpace();
+  // Skip non-constant address spaces.
+  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
+      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+    return false;
+  // Skip non-simple loads.
+  if (!LI.isSimple())
+    return false;
+  auto *Ty = LI.getType();
+  // Skip aggregate types.
+  if (Ty->isAggregateType())
+    return false;
+  unsigned TySize = DL->getTypeStoreSize(Ty);
+  // Only handle sub-DWORD loads.
+  if (TySize >= 4)
+    return false;
+  // The load must be at least naturally aligned.
+  if (LI.getAlign() < DL->getABITypeAlign(Ty))
+    return false;
+  // It should be uniform, i.e. a scalar load.
+  return DA->isUniform(&LI);
+}
+
+bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
+  if (!WidenLoads)
+    return false;
+
+  // Skip if the load is already at least DWORD aligned, as that case is
+  // handled in SDAG.
+  if (LI.getAlign() >= 4)
+    return false;
+
+  if (!canWidenScalarExtLoad(LI))
+    return false;
+
+  int64_t Offset = 0;
+  auto *Base =
+      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
+  // If the base is not DWORD aligned, it's not safe to perform the following
+  // transforms.
+  if (!isDWORDAligned(Base))
+    return false;
+
+  int64_t Adjust = Offset & 0x3;
+  if (Adjust == 0) {
+    // With a zero adjust, the original alignment can simply be promoted to
+    // DWORD alignment.
+    LI.setAlignment(Align(4));
+    return true;
+  }
+
+  IRBuilder<> IRB(&LI);
+  IRB.SetCurrentDebugLocation(LI.getDebugLoc());
+
+  unsigned AS = LI.getPointerAddressSpace();
+  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
+  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);
+
+  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
+  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
+  auto *NewPtr = IRB.CreateBitCast(
+      IRB.CreateConstGEP1_64(IRB.CreateBitCast(Base, Int8PtrTy),
+                             Offset - Adjust),
+      Int32PtrTy);
+  LoadInst *NewLd = IRB.CreateAlignedLoad(NewPtr, Align(4));
+  NewLd->copyMetadata(LI);
+  NewLd->setMetadata(LLVMContext::MD_range, nullptr);
+
+  unsigned ShAmt = Adjust * 8;
+  auto *NewVal = IRB.CreateBitCast(
+      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
+  LI.replaceAllUsesWith(NewVal);
+  RecursivelyDeleteTriviallyDeadInstructions(&LI);
+
+  return true;
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
+                      "AMDGPU IR late optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
+                    "AMDGPU IR late optimizations", false, false)
+
+char AMDGPULateCodeGenPrepare::ID = 0;
+
+FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
+  return new AMDGPULateCodeGenPrepare();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -236,6 +236,7 @@
   initializeAMDGPUPromoteAllocaPass(*PR);
   initializeAMDGPUPromoteAllocaToVectorPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
+  initializeAMDGPULateCodeGenPreparePass(*PR);
   initializeAMDGPUPropagateAttributesEarlyPass(*PR);
   initializeAMDGPUPropagateAttributesLatePass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -865,6 +866,7 @@ bool GCNPassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
 
+  addPass(createAMDGPULateCodeGenPreparePass());
   if (EnableAtomicOptimizations) {
     addPass(createAMDGPUAtomicOptimizerPass());
   }
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -58,6 +58,7 @@
   AMDGPUISelDAGToDAG.cpp
   AMDGPUISelLowering.cpp
   AMDGPUGlobalISelUtils.cpp
+  AMDGPULateCodeGenPrepare.cpp
   AMDGPULegalizerInfo.cpp
   AMDGPULibCalls.cpp
   AMDGPULibFunc.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -14,6 +14,22 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test2
+; GCN: enable_sgpr_dispatch_ptr = 1
+; GCN: s_load_dword s[[REG:[0-9]+]], s[4:5], 0x1
+; GCN: s_lshr_b32 s{{[0-9]+}}, s[[REG]], 16
+; GCN-NOT: load_ushort
+; GCN: s_endpgm
+define amdgpu_kernel void @test2(i32 addrspace(1)* %out) {
+  %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+  %d1 = getelementptr inbounds i8, i8 addrspace(4)* %dispatch_ptr, i64 6
+  %h1 = bitcast i8 addrspace(4)* %d1 to i16 addrspace(4)*
+  %v1 = load i16, i16 addrspace(4)* %h1
+  %e1 = zext i16 %v1 to i32
+  store i32 %e1, i32 addrspace(1)* %out
+  ret void
+}
+
 declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
 
 attributes #0 = { readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll b/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll
--- a/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll
@@ -22,6 +22,37 @@
   ret void
 }
 
+; A slightly more complicated case where more sub-dword loads can be coalesced
+; if they are not widened earlier.
+; GCN-LABEL: {{^}}load_4i16:
+; GCN: s_load_dwordx2 s{{\[}}[[D0:[0-9]+]]:[[D1:[0-9]+]]{{\]}}, s[4:5], 0x4
+; GCN-NOT: s_load_dword {{s[0-9]+}}, s[4:5], 0x4
+; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s[[D0]], 16
+; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s[[D1]], 16
+; GCN: s_endpgm
+define protected amdgpu_kernel void @load_4i16(i32 addrspace(1)* %out) {
+entry:
+  %disp = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %gep_x = getelementptr i8, i8 addrspace(4)* %disp, i64 4
+  %gep_x.cast = bitcast i8 addrspace(4)* %gep_x to i16 addrspace(4)*
+  %id_x = load i16, i16 addrspace(4)* %gep_x.cast, align 4, !invariant.load !0 ; load workgroup size x
+  %gep_y = getelementptr i8, i8 addrspace(4)* %disp, i64 6
+  %gep_y.cast = bitcast i8 addrspace(4)* %gep_y to i16 addrspace(4)*
+  %id_y = load i16, i16 addrspace(4)* %gep_y.cast, align 2, !invariant.load !0 ; load workgroup size y
+  %gep_z = getelementptr i8, i8 addrspace(4)* %disp, i64 8
+  %gep_z.cast = bitcast i8 addrspace(4)* %gep_z to i16 addrspace(4)*
+  %id_z = load i16, i16 addrspace(4)* %gep_z.cast, align 4, !invariant.load !0 ; load workgroup size z
+  %gep_w = getelementptr i8, i8 addrspace(4)* %disp, i64 10
+  %gep_w.cast = bitcast i8 addrspace(4)* %gep_w to i16 addrspace(4)*
+  %id_w = load i16, i16 addrspace(4)* %gep_w.cast, align 2, !invariant.load !0 ; load reserved field at offset 10
+  %add = add nuw nsw i16 %id_y, %id_x
+  %add2 = add nuw nsw i16 %id_z, %id_w
+  %add3 = add nuw nsw i16 %add, %add2
+  %conv = zext i16 %add3 to i32
+  store i32 %conv, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
 declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 
 !0 = !{!0}
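
For reference, a minimal before/after sketch (not part of the patch) of the IR rewrite performed by visitLoadInst, assuming a hypothetical DWORD-aligned constant-address-space base pointer %base and a naturally aligned i16 load at byte offset 6, i.e. Offset = 6 and Adjust = 2:

; Before: a uniform sub-dword load that is only 2-byte aligned.
  %gep = getelementptr i8, i8 addrspace(4)* %base, i64 6
  %ptr = bitcast i8 addrspace(4)* %gep to i16 addrspace(4)*
  %val = load i16, i16 addrspace(4)* %ptr, align 2

; After: a DWORD-aligned i32 load at offset 4 (Offset - Adjust), followed by a
; logical shift right by Adjust * 8 = 16 bits and a truncate to the original type.
  %wide.gep = getelementptr i8, i8 addrspace(4)* %base, i64 4
  %wide.ptr = bitcast i8 addrspace(4)* %wide.gep to i32 addrspace(4)*
  %wide = load i32, i32 addrspace(4)* %wide.ptr, align 4
  %shift = lshr i32 %wide, 16
  %val.new = trunc i32 %shift to i16

The widened load copies the original load's metadata except !range, which is dropped because it no longer describes the wider value.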