diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -76,6 +76,8 @@
   bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
   bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
   bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+  bool shouldSinkOperands(Instruction *I,
+                          SmallVectorImpl<Use *> &Ops) const override;
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                          EVT VT) const override;
   bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -32,6 +32,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsWebAssembly.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/KnownBits.h"
@@ -833,6 +834,30 @@
   return isa<Function>(GV) ? false : TargetLowering::isOffsetFoldingLegal(GA);
 }
 
+// Returns true, appending the insert and shuffle uses to \p Ops, when \p I is
+// a vector shift by a non-constant shuffle(insertelt(_, _, 0), _, zero-mask).
+bool WebAssemblyTargetLowering::shouldSinkOperands(
+    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
+  using namespace llvm::PatternMatch;
+  if (!I->getType()->isVectorTy() || !I->isShift())
+    return false;
+
+  Value *V = I->getOperand(1);
+  // We don't need to sink a constant splat; isa<> is the pure type test.
+  if (isa<Constant>(V))
+    return false;
+
+  if (!match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
+                          m_Value(), m_ZeroMask())))
+    return false;
+
+  // Sink the insertelement feeding the shuffle (the shuffle's operand 0).
+  Ops.push_back(&cast<Instruction>(V)->getOperandUse(0));
+  // Sink the shuffle itself (the shift-amount operand of I).
+  Ops.push_back(&I->getOperandUse(1));
+  return true;
+}
+
 EVT WebAssemblyTargetLowering::getSetCCResultType(const DataLayout &DL,
                                                   LLVMContext &C,
                                                   EVT VT) const {
diff --git a/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll b/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+; Test that SIMD shifts are lowered correctly even when the splatted shift
+; amount is defined in a different basic block than the shift itself.
+
+target triple = "wasm32-unknown-unknown"
+
+define void @shl_loop(ptr %a, i8 %shift, i32 %count) { ; splat of %shift is built in %entry and used by the shl in %body
+; CHECK-LABEL: shl_loop:
+; CHECK:         .functype shl_loop (i32, i32, i32) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:  .LBB0_1: # %body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    loop # label0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 0:p2align=0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shl
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.set 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i32.const -1
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.tee 2
+; CHECK-NEXT:    i32.eqz
+; CHECK-NEXT:    br_if 0 # 0: up to label0
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    end_loop
+; CHECK-NEXT:    # fallthrough-return
+entry:
+ %t1 = insertelement <16 x i8> undef, i8 %shift, i32 0 ; %shift into lane 0
+ %vshift = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer ; all-zero mask broadcasts lane 0
+ br label %body
+body:
+ %out = phi ptr [%a, %entry], [%b, %body]
+ %i = phi i32 [0, %entry], [%next, %body]
+ %v = load <16 x i8>, ptr %out, align 1
+ %r = shl <16 x i8> %v, %vshift ; shift amount is defined outside this block
+ %b = getelementptr inbounds i8, ptr %out, i32 16
+ store <16 x i8> %r, ptr %b
+ %next = add i32 %i, 1
+ %i.cmp = icmp eq i32 %next, %count
+ br i1 %i.cmp, label %body, label %exit ; loops back while %next == %count
+exit:
+ ret void
+}
+
+; Test that SIMD shifts are lowered correctly when the scalar shift value
+; is a phi inside the loop body.
+
+define void @shl_phi_loop(ptr %a, i8 %shift, i32 %count) { ; splat is built in %body from a phi of %shift
+; CHECK-LABEL: shl_phi_loop:
+; CHECK:         .functype shl_phi_loop (i32, i32, i32) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:  .LBB1_1: # %body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    loop # label1:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 0:p2align=0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shl
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    local.set 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.set 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i32.const -1
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.tee 2
+; CHECK-NEXT:    i32.eqz
+; CHECK-NEXT:    br_if 0 # 0: up to label1
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    end_loop
+; CHECK-NEXT:    # fallthrough-return
+entry:
+ br label %body
+body:
+ %out = phi ptr [%a, %entry], [%b, %body]
+ %i = phi i32 [0, %entry], [%next, %body]
+ %t1 = phi i8 [%shift, %entry], [%sand, %body] ; scalar shift amount: %shift on entry, then %sand
+ %t2 = insertelement <16 x i8> undef, i8 %t1, i32 0 ; %t1 into lane 0
+ %vshift = shufflevector <16 x i8> %t2, <16 x i8> undef, <16 x i32> zeroinitializer ; all-zero mask broadcasts lane 0
+ %v = load <16 x i8>, ptr %out, align 1
+ %r = shl <16 x i8> %v, %vshift
+ %b = getelementptr inbounds i8, ptr %out, i32 16
+ store <16 x i8> %r, ptr %b
+ %sand = and i8 %t1, 1 ; next iteration's shift amount
+ %next = add i32 %i, 1
+ %i.cmp = icmp eq i32 %next, %count
+ br i1 %i.cmp, label %body, label %exit ; loops back while %next == %count
+exit:
+ ret void
+}