diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -40,21 +40,25 @@
   let PrintMethod = "printBrList";
 }
 
-// TODO: SelectionDAG's lowering insists on using a pointer as the index for
-// jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode
-// currently.
-let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
+// Duplicating a BR_TABLE is almost never a good idea. In particular, when the
+// br_table is in a loop, duplication can lead to irreducible control flow due
+// to tail merging.
+let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1, isNotDuplicable = 1 in {
+
 defm BR_TABLE_I32 : I<(outs), (ins I32:$index, variable_ops),
                       (outs), (ins brlist:$brl),
                       [(WebAssemblybr_table I32:$index)],
                       "br_table \t$index", "br_table \t$brl",
                       0x0e>;
+// TODO: SelectionDAG's lowering insists on using a pointer as the index for
+// jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode
+// currently.
 defm BR_TABLE_I64 : I<(outs), (ins I64:$index, variable_ops),
                       (outs), (ins brlist:$brl),
                       [(WebAssemblybr_table I64:$index)],
                       "br_table \t$index", "br_table \t$brl",
                       0x0e>;
-} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
+} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1, isNotDuplicable = 1
 
 // This is technically a control-flow instruction, since all it affects is the
 // IP.
diff --git a/llvm/test/CodeGen/WebAssembly/indirectbr.ll b/llvm/test/CodeGen/WebAssembly/indirectbr.ll
--- a/llvm/test/CodeGen/WebAssembly/indirectbr.ll
+++ b/llvm/test/CodeGen/WebAssembly/indirectbr.ll
@@ -13,36 +13,21 @@
 
 ; Just check the barest skeleton of the structure
 ; CHECK-LABEL: test1:
-; CHECK: block
-; CHECK: block
-; CHECK: block
-; CHECK: block
 ; CHECK: i32.load
 ; CHECK: i32.load
-; CHECK: i32.const
-; CHECK: i32.add $push[[DEST:.+]]=
-; CHECK: br_table $pop[[DEST]]
-; CHECK: end_block
-; CHECK: end_block
-; CHECK: end_block
-; CHECK: end_block
 ; CHECK: loop
 ; CHECK: block
 ; CHECK: block
 ; CHECK: block
 ; CHECK: block
-; CHECK: br_table ${{[^,]+}}, 0, 1, 2, 2
-; CHECK: end_block
-; CHECK: end_block
-; CHECK: end_block
-; CHECK: block
-; CHECK: block
-; CHECK: block
 ; CHECK: br_table ${{[^,]+}}, 1, 2, 0
 ; CHECK: end_block
 ; CHECK: end_block
 ; CHECK: end_block
+; CHECK: end_block
+; CHECK: br
 ; CHECK: end_loop
+; CHECK: end_function
 ; CHECK: test1.targets:
 ; CHECK-NEXT: .int32
 ; CHECK-NEXT: .int32
diff --git a/llvm/test/CodeGen/WebAssembly/switch-in-loop.ll b/llvm/test/CodeGen/WebAssembly/switch-in-loop.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/switch-in-loop.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs | FileCheck %s
+
+;; Test that a small but nontrivial switch in a loop (like in a
+;; bytecode interpreter) lowers reasonably without any irreducible
+;; control flow being introduced.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32"
+
+declare void @a(i32*)
+declare void @b(i32*)
+
+; CHECK-LABEL: switch_in_loop:
+; CHECK-NEXT: .functype switch_in_loop (i32, i32) -> (i32)
+; CHECK:    global.get __stack_pointer
+; CHECK:    global.set __stack_pointer
+; CHECK:    block
+; CHECK:    br_if 0
+; CHECK: .LBB0_2:
+; CHECK:    loop
+; CHECK:    block
+; CHECK:    block
+; CHECK:    block
+; CHECK:    br_table {0, 1, 2}
+; CHECK: .LBB0_3:
+; CHECK:    end_block
+; CHECK:    call a
+; CHECK:    br 1
+; CHECK: .LBB0_4:
+; CHECK:    end_block
+; CHECK:    call b
+; CHECK: .LBB0_5:
+; CHECK:    end_block
+; CHECK:    br_if 0
+; CHECK:    end_loop
+; CHECK: .LBB0_7:
+; CHECK:    end_block
+; CHECK:    global.set __stack_pointer
+; CHECK:    end_function
+define i32 @switch_in_loop(i32* %ops, i32 %len) {
+entry:
+  %res = alloca i32
+  %0 = bitcast i32* %res to i8*
+  store i32 0, i32* %res
+  %cmp6 = icmp sgt i32 %len, 0
+  br i1 %cmp6, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup.loopexit:                        ; preds = %sw.epilog
+  %.pre = load i32, i32* %res
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %1 = phi i32 [ %.pre, %for.cond.cleanup.loopexit ], [ 0, %entry ]
+  ret i32 %1
+
+for.body:                                         ; preds = %entry, %sw.epilog
+  %i.07 = phi i32 [ %inc, %sw.epilog ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %ops, i32 %i.07
+  %2 = load i32, i32* %arrayidx
+  switch i32 %2, label %sw.epilog [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+  ]
+
+sw.bb:                                            ; preds = %for.body
+  call void @a(i32* nonnull %res)
+  br label %sw.epilog
+
+sw.bb1:                                           ; preds = %for.body
+  call void @b(i32* nonnull %res)
+  br label %sw.epilog
+
+sw.epilog:                                        ; preds = %for.body, %sw.bb1, %sw.bb
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %len
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}