Index: lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1593,10 +1593,21 @@
 /// processed with a bytemask of 255. BitMask is always in the local
 /// (OverallLeftShift) coordinate space.
 ///
+/// It is possible to create an IR pattern that results in exponential execution
+/// time here, by making a tree where every OR refers to one other OR via *both*
+/// operands:
+///
+///   for (i = 0; i < N; ++i)
+///     b |= b >> 1;  // Causes 2^N executions of CollectBitParts!
+///
+/// To defend against this (and because ORs are the only node that cause us to
+/// fork control flow) we keep a counter alive between all invocations of
+/// CollectBitParts, expected to be initialized to bitwidth(b).
 static bool CollectBitParts(Value *V, int OverallLeftShift, APInt BitMask,
                             SmallVectorImpl<Value *> &BitValues,
                             SmallVectorImpl<int> &BitProvenance,
-                            SmallPtrSetImpl<Value*> &OrValues) {
+                            SmallPtrSetImpl<Value*> &OrValues,
+                            unsigned &NumOrsRemaining) {
   if (Instruction *I = dyn_cast<Instruction>(V)) {
     // If this is a bitreverse intrinsic, it can obviously be part of a
     // bitreverse/bswap.
@@ -1622,11 +1633,16 @@
     }
     
     // If this is an or instruction, it may be an inner node of the bswap.
-    if (I->getOpcode() == Instruction::Or)
+    if (I->getOpcode() == Instruction::Or) {
+      if (NumOrsRemaining-- == 0)
+        return true;
       return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask,
-                             BitValues, BitProvenance, OrValues) ||
+                             BitValues, BitProvenance, OrValues,
+                             NumOrsRemaining) ||
              CollectBitParts(I->getOperand(1), OverallLeftShift, BitMask,
-                             BitValues, BitProvenance, OrValues);
+                             BitValues, BitProvenance, OrValues,
+                             NumOrsRemaining);
+    }
 
     // If this is a logical shift by a constant, recurse with OverallLeftShift
     // and BitMask adjusted.
@@ -1654,7 +1670,8 @@
         return true;
 
       return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask,
-                             BitValues, BitProvenance, OrValues);
+                             BitValues, BitProvenance, OrValues,
+                             NumOrsRemaining);
     }
 
     // If this is a logical 'and' with a mask that clears bits, clear the
@@ -1682,7 +1699,8 @@
       }
 
       return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask,
-                             BitValues, BitProvenance, OrValues);
+                             BitValues, BitProvenance, OrValues,
+                             NumOrsRemaining);
     }
   }
 
@@ -1754,7 +1772,9 @@
   // Try to find all the pieces corresponding to the bswap.
   APInt BitMask = APInt::getAllOnesValue(BitValues.size());
   SmallPtrSet<Value*, 2> OrValues;
-  if (CollectBitParts(&I, 0, BitMask, BitValues, BitProvenance, OrValues))
+  unsigned NumOrsRemaining = BitMask.getBitWidth();
+  if (CollectBitParts(&I, 0, BitMask, BitValues, BitProvenance, OrValues,
+                      NumOrsRemaining))
     return nullptr;
 
   // Check to see if all of the bits come from the same value.
Index: test/Transforms/InstCombine/bitreverse-hang.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/bitreverse-hang.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -loop-unroll -instcombine -S | FileCheck %s
+
+; This test is a worst-case scenario for bitreversal/byteswap detection.
+; After loop unrolling (the unrolled loop is unreadably large so it has been kept
+; rolled here), we have a binary tree of OR operands (as bitreversal detection
+; looks straight through shifts):
+;
+;  OR
+;  | \
+;  |  LSHR
+;  | /
+;  OR
+;  | \
+;  |  LSHR
+;  | /
+;  OR
+;
+; This results in exponential runtime. The loop here is 32 iterations which will
+; totally hang if we don't deal with this case cleverly.
+
+@b = common global i32 0, align 4
+
+; CHECK: define i32 @fn1
+define i32 @fn1() #0 {
+entry:
+  %b.promoted = load i32, i32* @b, align 4, !tbaa !2
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %or4 = phi i32 [ %b.promoted, %entry ], [ %or, %for.body ]
+  %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %shr = lshr i32 %or4, 1
+  %or = or i32 %shr, %or4
+  %inc = add nuw nsw i32 %i.03, 1
+  %exitcond = icmp eq i32 %inc, 32
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  store i32 %or, i32* @b, align 4, !tbaa !2
+  ret i32 undef
+}
+
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"PIC Level", i32 2}
+!1 = !{!"clang version 3.8.0 (http://llvm.org/git/clang.git eb70f4e9cc9a4dc3dd57b032fb858d56b4b64a0e)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}