diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2269,6 +2269,40 @@
   (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
 >;
 
+// Restrict the range to prevent using an additional VGPR
+// for the shifted value.
+def IMMBitSelRange : ImmLeaf<i32, [{
+  return Imm > 0 && Imm < 16;
+}]>;
+
+def IMMBitSelConst : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((1 << N->getZExtValue()), SDLoc(N),
+                                   MVT::i32);
+}]>;
+
+// Matching separate SRL and TRUNC instructions
+// with dependent operands (SRL dest is source of TRUNC)
+// generates three instructions. However, by shifting the mask
+// constant instead of the value, the V_LSHRREV_B32_e64 is folded
+// away and $a feeds the V_AND_B32_e64 instruction directly:
+// (trunc i32 (srl i32 $a, i32 $b)) ->
+//   v_and_b32_e64 $a, (1 << $b), $a
+//   v_cmp_eq_u32_e64 $a, (1 << $b), $a
+
+// Handle the VALU case.
+def : GCNPat <
+  (i1 (DivergentUnaryFrag<trunc> (i32 (srl i32:$a, IMMBitSelRange:$b)))),
+  (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 (IMMBitSelConst $b)), $a),
+    (i32 (IMMBitSelConst $b)))
+>;
+
+// Handle the scalar case.
+def : GCNPat <
+  (i1 (UniformUnaryFrag<trunc> (i32 (srl i32:$a, IMMBitSelRange:$b)))),
+  (S_CMP_EQ_U32 (S_AND_B32 (i32 (IMMBitSelConst $b)), $a),
+    (i32 (IMMBitSelConst $b)))
+>;
+
 def : GCNPat <
   (i1 (DivergentUnaryFrag<trunc> i64:$a)),
   (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1),
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -stop-after=amdgpu-isel -verify-machineinstrs -O0 < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: bb.0.entry:
+; GCN-NOT: V_LSHRREV_B32_e64
+; GCN: V_AND_B32_e64 2
+; GCN: V_CMP_EQ_U32_e64 killed %{{.*}}, 2
+define i32 @divergent_lshr_and_cmp(i32 %x) {
+entry:
+  %0 = and i32 %x, 2
+  %1 = icmp ne i32 %0, 0
+  ; The divergent branch is lowered to llvm.amdgcn.if, which keeps the truncate in the SDag.
+  br i1 %1, label %out.true, label %out.else
+
+out.true:
+  %2 = shl i32 %x, 2
+  ret i32 %2
+
+out.else:
+  ret i32 %x
+}
+
+; GCN-LABEL: bb.0.entry:
+; GCN: S_AND_B32 2
+; GCN: S_CMP_EQ_U32 killed %{{.*}}, 2
+define amdgpu_kernel void @uniform_opt_lshr_and_cmp(i1 addrspace(1)* %out, i32 %x) {
+entry:
+  %0 = and i32 %x, 2
+  %1 = icmp ne i32 %0, 0
+  ; Keep the truncate from being optimized away in the SDag.
+  br i1 %1, label %out.true, label %out.else
+
+out.true:
+  %2 = xor i1 %1, true
+  store i1 %2, i1 addrspace(1)* %out
+  ret void
+
+out.else:
+  store i1 %1, i1 addrspace(1)* %out
+  ret void
+}
\ No newline at end of file
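
For illustration, here is a minimal standalone IR sketch (hypothetical, not part of the patch; the function name and bit index are made up) that exercises the new VALU pattern directly, without going through the and+icmp DAGCombine used in the test. Since 3 falls inside IMMBitSelRange, isel should select V_AND_B32_e64 and V_CMP_EQ_U32_e64 with the inline constant 8 (1 << 3) and no intermediate V_LSHRREV_B32_e64:

; Hypothetical example: test bit 3 of a divergent value (function arguments
; in a non-kernel function are divergent). Expected selection under the new
; GCNPat: V_AND_B32_e64 8, %x followed by V_CMP_EQ_U32_e64 ..., 8.
define i1 @divergent_bit3(i32 %x) {
entry:
  %shifted = lshr i32 %x, 3          ; srl i32 $a, 3; 3 is in IMMBitSelRange
  %bit = trunc i32 %shifted to i1    ; matched by DivergentUnaryFrag<trunc>
  ret i1 %bit
}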
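
A hedged sketch of the scalar path as well (again hypothetical, not from the patch): in an amdgpu_kernel the argument %x is a uniform kernel argument, so the UniformUnaryFrag<trunc> pattern should apply and the same bit test should come out as S_AND_B32 / S_CMP_EQ_U32 with the constant 8:

; Hypothetical example: the uniform variant of the same bit test.
; Expected selection: S_AND_B32 8, %x then S_CMP_EQ_U32 ..., 8.
define amdgpu_kernel void @uniform_bit3(i1 addrspace(1)* %out, i32 %x) {
entry:
  %shifted = lshr i32 %x, 3
  %bit = trunc i32 %shifted to i1
  store i1 %bit, i1 addrspace(1)* %out
  ret void
}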