diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -331,6 +331,9 @@
 void initializeGCNPreRAOptimizationsPass(PassRegistry &);
 extern char &GCNPreRAOptimizationsID;
 
+FunctionPass *createAMDGPUSetWavePriorityPass();
+void initializeAMDGPUSetWavePriorityPass(PassRegistry &);
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
@@ -0,0 +1,166 @@
+//===- AMDGPUSetWavePriority.cpp - Set wave priority ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Pass to temporarily raise the wave priority from the start of the shader
+/// function until its last VMEM instructions, so that younger waves can
+/// issue their VMEM instructions as well.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Allocator.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-set-wave-priority"
+
+namespace {
+
+struct MBBInfo {
+  MBBInfo() = default;
+  bool MayReachVMEMLoad = false;
+};
+
+using MBBInfoSet = DenseMap<const MachineBasicBlock *, MBBInfo>;
+
+class AMDGPUSetWavePriority : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUSetWavePriority() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "Set wave priority"; }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  MachineInstr *BuildSetprioMI(MachineFunction &MF, unsigned priority) const;
+
+  const SIInstrInfo *TII;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(AMDGPUSetWavePriority, DEBUG_TYPE, "Set wave priority", false,
+                false)
+
+char AMDGPUSetWavePriority::ID = 0;
+
+FunctionPass *llvm::createAMDGPUSetWavePriorityPass() {
+  return new AMDGPUSetWavePriority();
+}
+
+MachineInstr *AMDGPUSetWavePriority::BuildSetprioMI(MachineFunction &MF,
+                                                    unsigned priority) const {
+  return BuildMI(MF, DebugLoc(), TII->get(AMDGPU::S_SETPRIO)).addImm(priority);
+}
+
+// Checks that for every predecessor Pred that can reach a VMEM load,
+// none of Pred's successors can reach a VMEM load.
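+// In other words, lowering the priority at the end of such a Pred is safe
+// only when every VMEM load Pred may reach is contained in Pred itself.
+// For example, in a diamond where only one arm contains a load, the
+// priority can be lowered at the end of that arm, but not at the end of
+// the entry block, since one of its successors still reaches the load.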
+static bool
+CanLowerPriorityDirectlyInPredecessors(const MachineBasicBlock &MBB,
+                                       MBBInfoSet &MBBInfos) {
+  for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+    if (!MBBInfos[Pred].MayReachVMEMLoad)
+      continue;
+    for (const MachineBasicBlock *Succ : Pred->successors()) {
+      if (MBBInfos[Succ].MayReachVMEMLoad)
+        return false;
+    }
+  }
+  return true;
+}
+
+static bool isVMEMLoad(const MachineInstr &MI) {
+  return SIInstrInfo::isVMEM(MI) && MI.mayLoad();
+}
+
+bool AMDGPUSetWavePriority::runOnMachineFunction(MachineFunction &MF) {
+  const unsigned HighPriority = 3;
+  const unsigned LowPriority = 0;
+
+  Function &F = MF.getFunction();
+  if (skipFunction(F) || !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+    return false;
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
+
+  MBBInfoSet MBBInfos;
+  SmallVector<const MachineBasicBlock *> Worklist;
+  for (MachineBasicBlock &MBB : MF) {
+    if (any_of(MBB, isVMEMLoad))
+      Worklist.push_back(&MBB);
+  }
+
+  // Mark blocks from which control may reach VMEM loads.
+  while (!Worklist.empty()) {
+    const MachineBasicBlock *MBB = Worklist.pop_back_val();
+    MBBInfo &Info = MBBInfos[MBB];
+    if (!Info.MayReachVMEMLoad) {
+      Info.MayReachVMEMLoad = true;
+      Worklist.append(MBB->pred_begin(), MBB->pred_end());
+    }
+  }
+
+  MachineBasicBlock &Entry = MF.front();
+  if (!MBBInfos[&Entry].MayReachVMEMLoad)
+    return false;
+
+  // Raise the priority at the beginning of the shader.
+  MachineBasicBlock::iterator I = Entry.begin(), E = Entry.end();
+  while (I != E && !SIInstrInfo::isVALU(*I) && !I->isTerminator())
+    ++I;
+  Entry.insert(I, BuildSetprioMI(MF, HighPriority));
+
+  // Lower the priority on edges where control leaves blocks from which
+  // VMEM loads are reachable.
+  SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks;
+  for (MachineBasicBlock &MBB : MF) {
+    if (MBBInfos[&MBB].MayReachVMEMLoad) {
+      if (MBB.succ_empty())
+        PriorityLoweringBlocks.insert(&MBB);
+      continue;
+    }
+
+    if (CanLowerPriorityDirectlyInPredecessors(MBB, MBBInfos)) {
+      for (MachineBasicBlock *Pred : MBB.predecessors()) {
+        if (MBBInfos[Pred].MayReachVMEMLoad)
+          PriorityLoweringBlocks.insert(Pred);
+      }
+      continue;
+    }
+
+    // Where lowering the priority in the predecessors is not possible, the
+    // block receiving control either was never part of a loop in the first
+    // place, or loop canonicalization should already have tried to split
+    // the edge and insert a preheader. If that failed for whatever reason,
+    // the only remaining option is to lower the priority within the loop.
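+    // In that case the lowering S_SETPRIO ends up at the start of this
+    // block (it contains no VMEM load itself); if the block is a loop
+    // header, the instruction executes on every iteration, which is
+    // wasteful but still correct.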
+    PriorityLoweringBlocks.insert(&MBB);
+  }
+
+  for (MachineBasicBlock *MBB : PriorityLoweringBlocks) {
+    MachineBasicBlock::iterator I = MBB->end(), B = MBB->begin();
+    while (I != B) {
+      if (isVMEMLoad(*--I)) {
+        ++I;
+        break;
+      }
+    }
+    MBB->insert(I, BuildSetprioMI(MF, LowPriority));
+  }
+
+  return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -277,6 +277,10 @@
                            cl::init(true), cl::Hidden,
                            cl::desc("Enable machine DCE inside regalloc"));
 
+static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
+                                           cl::desc("Adjust wave priority"),
+                                           cl::init(false), cl::Hidden);
+
 static cl::opt<bool> EnableScalarIRPasses(
   "amdgpu-scalar-ir-passes",
   cl::desc("Enable scalar IR passes"),
@@ -1360,6 +1364,8 @@
   addPass(&SIInsertHardClausesID);
   addPass(&SILateBranchLoweringPassID);
+  if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
+    addPass(createAMDGPUSetWavePriorityPass());
   if (getOptLevel() > CodeGenOpt::None)
     addPass(&SIPreEmitPeepholeID);
   // The hazard recognizer that runs as part of the post-ra scheduler does not
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -89,6 +89,7 @@
   AMDGPUReplaceLDSUseWithPointer.cpp
   AMDGPUResourceUsageAnalysis.cpp
   AMDGPURewriteOutArguments.cpp
+  AMDGPUSetWavePriority.cpp
   AMDGPUSubtarget.cpp
   AMDGPUTargetMachine.cpp
   AMDGPUTargetObjectFile.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
@@ -0,0 +1,153 @@
+; RUN: llc -mtriple=amdgcn -amdgpu-set-wave-priority=true -o - %s | \
+; RUN:   FileCheck %s
+
+; CHECK-LABEL: no_setprio:
+; CHECK-NOT: s_setprio
+; CHECK: ; return to shader part epilog
+define amdgpu_ps <2 x float> @no_setprio() {
+  ret <2 x float> <float 0.0, float 0.0>
+}
+
+; CHECK-LABEL: vmem_in_exit_block:
+; CHECK: s_setprio 3
+; CHECK: buffer_load_dwordx2
+; CHECK-NEXT: s_setprio 0
+; CHECK: ; return to shader part epilog
+define amdgpu_ps <2 x float> @vmem_in_exit_block(<4 x i32> inreg %p) {
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
+; CHECK-LABEL: branch:
+; CHECK: s_setprio 3
+; CHECK: s_cbranch_scc0 [[A:.*]]
+; CHECK: {{.*}}: ; %b
+; CHECK: buffer_load_dwordx2
+; CHECK-NEXT: s_setprio 0
+; CHECK: s_branch [[EXIT:.*]]
+; CHECK: [[A]]: ; %a
+; CHECK-NEXT: s_setprio 0
+; CHECK: s_branch [[EXIT]]
+; CHECK-NEXT: [[EXIT]]:
+define amdgpu_ps <2 x float> @branch(<4 x i32> inreg %p, i32 inreg %i) {
+  %cond = icmp eq i32 %i, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  ret <2 x float> <float 0.0, float 0.0>
+
+b:
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
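+; The priority may be lowered more than once along a single path: below, the
+; path %a -> %c executes s_setprio 0 both after the load in %a and again at
+; the start of %c, which is redundant but harmless.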
+; CHECK-LABEL: setprio_follows_setprio:
+; CHECK: s_setprio 3
+; CHECK: buffer_load_dwordx2
+; CHECK: s_cbranch_scc1 [[C:.*]]
+; CHECK: {{.*}}: ; %a
+; CHECK: buffer_load_dwordx2
+; CHECK-NEXT: s_setprio 0
+; CHECK: s_cbranch_scc1 [[C]]
+; CHECK: {{.*}}: ; %b
+; CHECK-NOT: s_setprio
+; CHECK: s_branch [[EXIT:.*]]
+; CHECK: [[C]]: ; %c
+; CHECK-NEXT: s_setprio 0
+; CHECK: s_branch [[EXIT]]
+; CHECK: [[EXIT]]:
+define amdgpu_ps <2 x float> @setprio_follows_setprio(<4 x i32> inreg %p, i32 inreg %i) {
+entry:
+  %v1 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  %cond1 = icmp ne i32 %i, 0
+  br i1 %cond1, label %a, label %c
+
+a:
+  %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
+  %cond2 = icmp ne i32 %i, 1
+  br i1 %cond2, label %b, label %c
+
+b:
+  ret <2 x float> %v2
+
+c:
+  %v3 = phi <2 x float> [%v1, %entry], [%v2, %a]
+  %v4 = fadd <2 x float> %v1, %v3
+  ret <2 x float> %v4
+}
+
+; CHECK-LABEL: loop:
+; CHECK: {{.*}}: ; %entry
+; CHECK: s_setprio 3
+; CHECK-NOT: s_setprio
+; CHECK: [[LOOP:.*]]: ; %loop
+; CHECK-NOT: s_setprio
+; CHECK: buffer_load_dwordx2
+; CHECK-NOT: s_setprio
+; CHECK: s_cbranch_scc1 [[LOOP]]
+; CHECK-NEXT: {{.*}}: ; %exit
+; CHECK-NEXT: s_setprio 0
+define amdgpu_ps <2 x float> @loop(<4 x i32> inreg %p) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [0, %entry], [%i2, %loop]
+  %sum = phi <2 x float> [<float 0.0, float 0.0>, %entry], [%sum2, %loop]
+
+  %i2 = add i32 %i, 1
+
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 %i, i32 0, i32 0, i32 0)
+  %sum2 = fadd <2 x float> %sum, %v
+
+  %cond = icmp ult i32 %i2, 5
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret <2 x float> %sum2
+}
+
+; CHECK-LABEL: edge_split:
+; CHECK: s_setprio 3
+; CHECK: buffer_load_dwordx2
+; CHECK-NOT: s_setprio
+; CHECK: s_cbranch_scc1 [[ANOTHER_LOAD:.*]]
+; CHECK: {{.*}}: ; %loop.preheader
+; CHECK-NEXT: s_setprio 0
+; CHECK: [[LOOP:.*]]: ; %loop
+; CHECK-NOT: s_setprio
+; CHECK: s_cbranch_scc1 [[LOOP]]
+; CHECK: {{.*}}: ; %exit
+; CHECK-NOT: s_setprio
+; CHECK: s_branch [[RET:.*]]
+; CHECK: [[ANOTHER_LOAD]]: ; %another_load
+; CHECK: buffer_load_dwordx2
+; CHECK-NEXT: s_setprio 0
+; CHECK: s_branch [[RET]]
+; CHECK: [[RET]]:
+define amdgpu_ps <2 x float> @edge_split(<4 x i32> inreg %p, i32 inreg %x) {
+entry:
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  %cond = icmp ne i32 %x, 0
+  br i1 %cond, label %loop, label %another_load
+
+loop:
+  %i = phi i32 [0, %entry], [%i2, %loop]
+  %mul = phi <2 x float> [%v, %entry], [%mul2, %loop]
+
+  %i2 = add i32 %i, 1
+  %mul2 = fmul <2 x float> %mul, %v
+
+  %cond2 = icmp ult i32 %i2, 5
+  br i1 %cond2, label %loop, label %exit
+
+exit:
+  ret <2 x float> %mul2
+
+another_load:
+  %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
+  %sum = fadd <2 x float> %v, %v2
+  ret <2 x float> %sum
+}
+
+declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) nounwind