diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1348,7 +1348,8 @@
   Value *X, *Y, *Zero;
   if (!match(&I, m_ICmp(Pred, m_OneUse(m_IRem(m_Value(X), m_Value(Y))),
                         m_CombineAnd(m_Zero(), m_Value(Zero)))))
-    return nullptr;
+    // try to recognize a decomposed version
+    return foldIRemByPowerOfTwoDecomposedToBitTest(I);
   if (!isKnownToBeAPowerOfTwo(Y, /*OrZero*/ true, 0, &I))
     return nullptr;
   // This may increase instruction count, we don't enforce that Y is a constant.
@@ -1357,6 +1358,41 @@
   return ICmpInst::Create(Instruction::ICmp, Pred, Masked, Zero);
 }
 
+/// Fold decomposed version of "X % C == 0" to "X & C-1 == 0".
+/// "X % C" can also be represented as "X - (X / C) * C" which is optimized
+/// into "((X / -C1) >> C2)) + X" so the latter can be folded to
+/// "X & C-1" for icmp eq/ne 0
+Instruction *
+InstCombiner::foldIRemByPowerOfTwoDecomposedToBitTest(ICmpInst &I) {
+  // This fold is only valid for equality predicates.
+  if (!I.isEquality())
+    return nullptr;
+
+  ICmpInst::Predicate Pred;
+  Value *X, *Y, *Zero;
+  const APInt *C1, *C2;
+  if (!match(&I,
+             m_ICmp(Pred,
+                    m_Add(m_Shl(m_SDiv(m_Value(X), m_APInt(C1)), m_APInt(C2)),
+                          m_Value(Y)),
+                    m_Value(Zero))))
+    return nullptr;
+
+  // C1 should be some negative power of two number
+  if ((X != Y) || !C1->isNegative() || !C1->abs().isPowerOf2())
+    return nullptr;
+
+  // 1 << C2 == C1
+  APInt one(C2->getBitWidth(), 1);
+  if ((C1->abs() != one.shl(*C2)) || C2->sle(one))
+    return nullptr;
+
+  // Replace with "X & C-1 ==/!= 0"
+  uint64_t AndMask = C1->abs().getZExtValue() - 1;
+  Value *And = Builder.CreateAnd(X, ConstantInt::get(X->getType(), AndMask));
+  return new ICmpInst(Pred, And, Zero);
+}
+
 /// Fold equality-comparison between zero and any (maybe truncated) right-shift
 /// by one-less-than-bitwidth into a sign test on the original value.
 Instruction *InstCombiner::foldSignBitTest(ICmpInst &I) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -934,6 +934,7 @@
   Instruction *foldICmpBinOp(ICmpInst &Cmp, const SimplifyQuery &SQ);
   Instruction *foldICmpEquality(ICmpInst &Cmp);
   Instruction *foldIRemByPowerOfTwoToBitTest(ICmpInst &I);
+  Instruction *foldIRemByPowerOfTwoDecomposedToBitTest(ICmpInst &I);
   Instruction *foldSignBitTest(ICmpInst &I);
   Instruction *foldICmpWithZero(ICmpInst &Cmp);
 
diff --git a/llvm/test/Transforms/InstCombine/icmp-div-constant.ll b/llvm/test/Transforms/InstCombine/icmp-div-constant.ll
--- a/llvm/test/Transforms/InstCombine/icmp-div-constant.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-div-constant.ll
@@ -38,6 +38,19 @@
   ret i1 %r
 }
 
+define i1 @is_rem32_pos_decomposed_i8(i8 %x) {
+; CHECK-LABEL: @is_rem32_pos_i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[X:%.*]], -97
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i8 [[TMP1]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %d = sdiv i8 %x, 32
+  %m = mul nsw i8 %d, 32
+  %s = sub nsw i8 %x, %m
+  %r = icmp eq i8 %s, 0
+  ret i1 %r
+}
+
 ; i16 -32765 == 32771 == 0b1000000000000011
 
 define i1 @is_rem4_neg_i16(i16 %x) {