diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp @@ -26,11 +26,18 @@ #define DEBUG_TYPE "amdgpu-set-wave-priority" +static cl::opt<unsigned> DefaultVALUInstsThreshold( + "amdgpu-set-wave-priority-valu-insts-threshold", + cl::desc("VALU instruction count threshold for adjusting wave priority"), + cl::init(100), cl::Hidden); + namespace { struct MBBInfo { MBBInfo() = default; + unsigned NumVALUInstsAtStart = 0; bool MayReachVMEMLoad = false; + MachineInstr *LastVMEMLoad = nullptr; }; using MBBInfoSet = DenseMap<const MachineBasicBlock *, MBBInfo>; @@ -46,7 +53,9 @@ bool runOnMachineFunction(MachineFunction &MF) override; private: - MachineInstr *BuildSetprioMI(MachineFunction &MF, unsigned priority) const; + MachineInstr *BuildSetprioMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned priority) const; const SIInstrInfo *TII; }; @@ -62,9 +71,12 @@ return new AMDGPUSetWavePriority(); } -MachineInstr *AMDGPUSetWavePriority::BuildSetprioMI(MachineFunction &MF, - unsigned priority) const { - return BuildMI(MF, DebugLoc(), TII->get(AMDGPU::S_SETPRIO)).addImm(priority); +MachineInstr * +AMDGPUSetWavePriority::BuildSetprioMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned priority) const { + return BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SETPRIO)) + .addImm(priority); } // Checks that for every predecessor Pred that can reach a VMEM load, @@ -97,21 +109,58 @@ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); + unsigned VALUInstsThreshold = DefaultVALUInstsThreshold; + Attribute A = F.getFnAttribute("amdgpu-wave-priority-threshold"); + if (A.isValid()) + A.getValueAsString().getAsInteger(0, VALUInstsThreshold); + + // Find VMEM loads that may be executed before long-enough sequences of + // VALU instructions. 
We currently assume that backedges/loops, branch + // probabilities and other details can be ignored, so we essentially + // determine the largest number of VALU instructions along every + // possible path from the start of the function that may potentially be + // executed provided no backedge is ever taken. MBBInfoSet MBBInfos; - SmallVector<const MachineBasicBlock *, 16> Worklist; - for (MachineBasicBlock &MBB : MF) { - if (any_of(MBB, isVMEMLoad)) - Worklist.push_back(&MBB); - } - - // Mark blocks from which control may reach VMEM loads. - while (!Worklist.empty()) { - const MachineBasicBlock *MBB = Worklist.pop_back_val(); + for (MachineBasicBlock *MBB : post_order(&MF)) { MBBInfo &Info = MBBInfos[MBB]; - if (!Info.MayReachVMEMLoad) { - Info.MayReachVMEMLoad = true; - Worklist.append(MBB->pred_begin(), MBB->pred_end()); + bool AtStart = true; + unsigned MaxNumVALUInstsInMiddle = 0; + unsigned NumVALUInstsAtEnd = 0; + for (MachineInstr &MI : *MBB) { + if (isVMEMLoad(MI)) { + AtStart = false; + Info.NumVALUInstsAtStart = 0; + MaxNumVALUInstsInMiddle = 0; + NumVALUInstsAtEnd = 0; + Info.LastVMEMLoad = &MI; + } else if (SIInstrInfo::isDS(MI)) { + AtStart = false; + MaxNumVALUInstsInMiddle = + std::max(MaxNumVALUInstsInMiddle, NumVALUInstsAtEnd); + NumVALUInstsAtEnd = 0; + } else if (SIInstrInfo::isVALU(MI)) { + if (AtStart) + ++Info.NumVALUInstsAtStart; + ++NumVALUInstsAtEnd; + } } + + bool SuccsMayReachVMEMLoad = false; + unsigned NumFollowingVALUInsts = 0; + for (const MachineBasicBlock *Succ : MBB->successors()) { + SuccsMayReachVMEMLoad |= MBBInfos[Succ].MayReachVMEMLoad; + NumFollowingVALUInsts = + std::max(NumFollowingVALUInsts, MBBInfos[Succ].NumVALUInstsAtStart); + } + if (AtStart) + Info.NumVALUInstsAtStart += NumFollowingVALUInsts; + NumVALUInstsAtEnd += NumFollowingVALUInsts; + + unsigned MaxNumVALUInsts = + std::max(MaxNumVALUInstsInMiddle, NumVALUInstsAtEnd); + Info.MayReachVMEMLoad = + SuccsMayReachVMEMLoad || + (Info.LastVMEMLoad && MaxNumVALUInsts >= VALUInstsThreshold); } 
MachineBasicBlock &Entry = MF.front(); @@ -122,10 +171,10 @@ MachineBasicBlock::iterator I = Entry.begin(), E = Entry.end(); while (I != E && !SIInstrInfo::isVALU(*I) && !I->isTerminator()) ++I; - Entry.insert(I, BuildSetprioMI(MF, HighPriority)); + BuildSetprioMI(Entry, I, HighPriority); // Lower the priority on edges where control leaves blocks from which - // VMEM loads are reachable. + // the VMEM loads are reachable. SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks; for (MachineBasicBlock &MBB : MF) { if (MBBInfos[&MBB].MayReachVMEMLoad) { @@ -152,14 +201,12 @@ } for (MachineBasicBlock *MBB : PriorityLoweringBlocks) { - MachineBasicBlock::iterator I = MBB->end(), B = MBB->begin(); - while (I != B) { - if (isVMEMLoad(*--I)) { - ++I; - break; - } - } - MBB->insert(I, BuildSetprioMI(MF, LowPriority)); + BuildSetprioMI( + *MBB, + MBBInfos[MBB].LastVMEMLoad + ? std::next(MachineBasicBlock::iterator(MBBInfos[MBB].LastVMEMLoad)) + : MBB->begin(), + LowPriority); } return true; diff --git a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll --- a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll +++ b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll @@ -4,8 +4,9 @@ ; CHECK-LABEL: no_setprio: ; CHECK-NOT: s_setprio ; CHECK: ; return to shader part epilog -define amdgpu_ps <2 x float> @no_setprio() { - ret <2 x float> <float 0.0, float 0.0> +define amdgpu_ps <2 x float> @no_setprio(<2 x float> %a, <2 x float> %b) "amdgpu-wave-priority-threshold"="1" { + %s = fadd <2 x float> %a, %b + ret <2 x float> %s } ; CHECK-LABEL: vmem_in_exit_block: @@ -13,9 +14,10 @@ ; CHECK: buffer_load_dwordx2 ; CHECK-NEXT: s_setprio 0 ; CHECK: ; return to shader part epilog -define amdgpu_ps <2 x float> @vmem_in_exit_block(<4 x i32> inreg %p) { +define amdgpu_ps <2 x float> @vmem_in_exit_block(<4 x i32> inreg %p, <2 x float> %x) "amdgpu-wave-priority-threshold"="2" { %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) - ret <2 x float> %v + %s = 
fadd <2 x float> %v, %x + ret <2 x float> %s } ; CHECK-LABEL: branch: @@ -29,7 +31,7 @@ ; CHECK-NEXT: s_setprio 0 ; CHECK: s_branch [[EXIT]] ; CHECK-NEXT: [[EXIT]]: -define amdgpu_ps <2 x float> @branch(<4 x i32> inreg %p, i32 inreg %i) { +define amdgpu_ps <2 x float> @branch(<4 x i32> inreg %p, i32 inreg %i, <2 x float> %x) "amdgpu-wave-priority-threshold"="2" { %cond = icmp eq i32 %i, 0 br i1 %cond, label %a, label %b @@ -38,7 +40,8 @@ b: %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) - ret <2 x float> %v + %s = fadd <2 x float> %v, %x + ret <2 x float> %s } ; CHECK-LABEL: setprio_follows_setprio: @@ -48,7 +51,7 @@ ; CHECK: {{.*}}: ; %a ; CHECK: buffer_load_dwordx2 ; CHECK-NEXT: s_setprio 0 -; CHECK: s_cbranch_scc1 [[C]] +; CHECK: s_cbranch_vccnz [[C]] ; CHECK: {{.*}}: ; %b ; CHECK-NOT: s_setprio ; CHECK: s_branch [[EXIT:.*]] @@ -56,7 +59,7 @@ ; CHECK-NEXT: s_setprio 0 ; CHECK: s_branch [[EXIT]] ; CHECK: [[EXIT]]: -define amdgpu_ps <2 x float> @setprio_follows_setprio(<4 x i32> inreg %p, i32 inreg %i) { +define amdgpu_ps <2 x float> @setprio_follows_setprio(<4 x i32> inreg %p, i32 inreg %i) "amdgpu-wave-priority-threshold"="3" { entry: %v1 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) %cond1 = icmp ne i32 %i, 0 @@ -64,15 +67,16 @@ a: %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0) - %cond2 = icmp ne i32 %i, 1 + %v20 = extractelement <2 x float> %v2, i32 0 + %v21 = extractelement <2 x float> %v2, i32 1 + %cond2 = fcmp ult float %v20, %v21 br i1 %cond2, label %b, label %c b: ret <2 x float> %v2 c: - %v3 = phi <2 x float> [%v1, %entry], [%v2, %a] - %v4 = fadd <2 x float> %v1, %v3 + %v4 = fadd <2 x float> %v1, %v1 ret <2 x float> %v4 } @@ -87,7 +91,7 @@ ; CHECK: s_cbranch_scc1 [[LOOP]] ; CHECK-NEXT: {{.*}}: ; %exit ; CHECK-NEXT: s_setprio 0 -define amdgpu_ps <2 x float> @loop(<4 x i32> inreg %p) { +define 
amdgpu_ps <2 x float> @loop(<4 x i32> inreg %p) "amdgpu-wave-priority-threshold"="2" { entry: br label %loop @@ -125,7 +129,7 @@ ; CHECK-NEXT: s_setprio 0 ; CHECK: s_branch [[RET]] ; CHECK: [[RET]]: -define amdgpu_ps <2 x float> @edge_split(<4 x i32> inreg %p, i32 inreg %x) { +define amdgpu_ps <2 x float> @edge_split(<4 x i32> inreg %p, i32 inreg %x) "amdgpu-wave-priority-threshold"="2" { entry: %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) %cond = icmp ne i32 %x, 0 @@ -150,4 +154,53 @@ ret <2 x float> %sum } +; CHECK-LABEL: valu_insts_threshold: +; CHECK: s_setprio 3 +; CHECK: buffer_load_dwordx2 +; CHECK-NEXT: s_setprio 0 +; CHECK-COUNT-4: v_add_f32_e32 +; CHECK: s_cbranch_scc0 [[A:.*]] +; CHECK: {{.*}}: ; %b +; CHECK-NEXT: buffer_load_dwordx2 +; CHECK: s_branch [[END:.*]] +; CHECK: [[A]]: ; %a +; CHECK: s_branch [[END]] +; CHECK: [[END]]: +define amdgpu_ps <2 x float> @valu_insts_threshold(<4 x i32> inreg %p, i32 inreg %i) "amdgpu-wave-priority-threshold"="4" { + %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) + %add = fadd <2 x float> %v, %v + %add2 = fadd <2 x float> %add, %add + + %cond = icmp eq i32 %i, 0 + br i1 %cond, label %a, label %b + +a: + ret <2 x float> %add2 + +b: + %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 1, i32 0, i32 0) + %sub = fsub <2 x float> %add2, %v2 + ret <2 x float> %sub +} + +; CHECK-LABEL: valu_insts_threshold2: +; CHECK-NOT: s_setprio +; CHECK: ; -- End function +define amdgpu_ps <2 x float> @valu_insts_threshold2(<4 x i32> inreg %p, i32 inreg %i) "amdgpu-wave-priority-threshold"="5" { + %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) + %add = fadd <2 x float> %v, %v + %add2 = fadd <2 x float> %add, %add + + %cond = icmp eq i32 %i, 0 + br i1 %cond, label %a, label %b + +a: + ret <2 x float> %add2 + +b: + %v2 = call <2 x float> 
@llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 1, i32 0, i32 0) + %sub = fsub <2 x float> %add2, %v2 + ret <2 x float> %sub +} + declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) nounwind