diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -119,6 +119,7 @@
 TARGET_BUILTIN(__builtin_wasm_avgr_u_i16x8, "V8sV8sV8s", "nc", "simd128")
 
 TARGET_BUILTIN(__builtin_wasm_bitselect, "V4iV4iV4iV4i", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_shuffle_v8x16, "V16cV16cV16cIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIi", "nc", "simd128")
 
 TARGET_BUILTIN(__builtin_wasm_any_true_i8x16, "iV16c", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_any_true_i16x8, "iV8s", "nc", "simd128")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16044,6 +16044,20 @@
         CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Vec->getType()});
     return Builder.CreateCall(Callee, Vec);
   }
+  case WebAssembly::BI__builtin_wasm_shuffle_v8x16: {
+    Value *Ops[18];
+    size_t OpIdx = 0;
+    Ops[OpIdx++] = EmitScalarExpr(E->getArg(0));
+    Ops[OpIdx++] = EmitScalarExpr(E->getArg(1));
+    while (OpIdx < 18) {
+      llvm::APSInt LaneConst;
+      if (!E->getArg(OpIdx)->isIntegerConstantExpr(LaneConst, getContext()))
+        llvm_unreachable("Constant arg isn't actually constant?");
+      Ops[OpIdx++] = llvm::ConstantInt::get(getLLVMContext(), LaneConst);
+    }
+    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_shuffle);
+    return Builder.CreateCall(Callee, Ops);
+  }
   default:
     return nullptr;
   }
diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h
--- a/clang/lib/Headers/wasm_simd128.h
+++ b/clang/lib/Headers/wasm_simd128.h
@@ -1020,23 +1020,31 @@
 #define wasm_v8x16_shuffle(__a, __b, __c0, __c1, __c2, __c3, __c4, __c5, __c6, \
                            __c7, __c8, __c9, __c10, __c11, __c12, __c13,       \
                            __c14, __c15)                                       \
-  ((v128_t)(__builtin_shufflevector(                                           \
-      (__u8x16)(__a), (__u8x16)(__b), __c0, __c1, __c2, __c3, __c4, __c5,      \
-      __c6, __c7, __c8, __c9, __c10, __c11, __c12, __c13, __c14, __c15)))
+  ((v128_t)__builtin_wasm_shuffle_v8x16(                                       \
+      (__i8x16)(__a), (__i8x16)(__b), __c0, __c1, __c2, __c3, __c4, __c5,      \
+      __c6, __c7, __c8, __c9, __c10, __c11, __c12, __c13, __c14, __c15))
 
 #define wasm_v16x8_shuffle(__a, __b, __c0, __c1, __c2, __c3, __c4, __c5, __c6, \
                            __c7)                                               \
-  ((v128_t)(__builtin_shufflevector((__u16x8)(__a), (__u16x8)(__b), __c0,      \
-                                    __c1, __c2, __c3, __c4, __c5, __c6,        \
-                                    __c7)))
+  ((v128_t)__builtin_wasm_shuffle_v8x16(                                       \
+      (__i8x16)(__a), (__i8x16)(__b), __c0 * 2, __c0 * 2 + 1, __c1 * 2,        \
+      __c1 * 2 + 1, __c2 * 2, __c2 * 2 + 1, __c3 * 2, __c3 * 2 + 1, __c4 * 2,  \
+      __c4 * 2 + 1, __c5 * 2, __c5 * 2 + 1, __c6 * 2, __c6 * 2 + 1, __c7 * 2,  \
+      __c7 * 2 + 1))
 
 #define wasm_v32x4_shuffle(__a, __b, __c0, __c1, __c2, __c3)                   \
-  ((v128_t)(__builtin_shufflevector((__u32x4)(__a), (__u32x4)(__b), __c0,      \
-                                    __c1, __c2, __c3)))
+  ((v128_t)__builtin_wasm_shuffle_v8x16(                                       \
+      (__i8x16)(__a), (__i8x16)(__b), __c0 * 4, __c0 * 4 + 1, __c0 * 4 + 2,    \
+      __c0 * 4 + 3, __c1 * 4, __c1 * 4 + 1, __c1 * 4 + 2, __c1 * 4 + 3,        \
+      __c2 * 4, __c2 * 4 + 1, __c2 * 4 + 2, __c2 * 4 + 3, __c3 * 4,            \
+      __c3 * 4 + 1, __c3 * 4 + 2, __c3 * 4 + 3))
 
 #define wasm_v64x2_shuffle(__a, __b, __c0, __c1)                               \
-  ((v128_t)(                                                                   \
-      __builtin_shufflevector((__u64x2)(__a), (__u64x2)(__b), __c0, __c1)))
+  ((v128_t)__builtin_wasm_shuffle_v8x16(                                       \
+      (__i8x16)(__a), (__i8x16)(__b), __c0 * 8, __c0 * 8 + 1, __c0 * 8 + 2,    \
+      __c0 * 8 + 3, __c0 * 8 + 4, __c0 * 8 + 5, __c0 * 8 + 6, __c0 * 8 + 7,    \
+      __c1 * 8, __c1 * 8 + 1, __c1 * 8 + 2, __c1 * 8 + 3, __c1 * 8 + 4,        \
+      __c1 * 8 + 5, __c1 * 8 + 6, __c1 * 8 + 7))
 
 #ifdef __wasm_unimplemented_simd128__
 
diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
--- a/clang/test/CodeGen/builtins-wasm.c
+++ b/clang/test/CodeGen/builtins-wasm.c
@@ -724,5 +724,14 @@
 i8x16 swizzle_v8x16(i8x16 x, i8x16 y) {
   return __builtin_wasm_swizzle_v8x16(x, y);
   // WEBASSEMBLY: call <16 x i8> @llvm.wasm.swizzle(<16 x i8> %x, <16 x i8> %y)
+}
+
+i8x16 shuffle(i8x16 x, i8x16 y) {
+  return __builtin_wasm_shuffle_v8x16(x, y, 0, 1, 2, 3, 4, 5, 6, 7,
+                                      8, 9, 10, 11, 12, 13, 14, 15);
+  // WEBASSEMBLY: call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y,
+  // WEBASSEMBLY-SAME: i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+  // WEBASSEMBLY-SAME: i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14,
+  // WEBASSEMBLY-SAME: i32 15
   // WEBASSEMBLY-NEXT: ret
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -104,6 +104,13 @@
   Intrinsic<[llvm_v16i8_ty],
             [llvm_v16i8_ty, llvm_v16i8_ty],
             [IntrNoMem, IntrSpeculatable]>;
+def int_wasm_shuffle :
+  Intrinsic<[llvm_v16i8_ty],
+            [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i32_ty,
+             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrSpeculatable]>;
 def int_wasm_sub_saturate_signed :
   Intrinsic<[llvm_anyvector_ty],
             [LLVMMatchType<0>, LLVMMatchType<0>],
@@ -116,7 +123,6 @@
   Intrinsic<[llvm_anyvector_ty],
             [LLVMMatchType<0>, LLVMMatchType<0>],
             [IntrNoMem, IntrSpeculatable]>;
-
 def int_wasm_bitselect :
   Intrinsic<[llvm_anyvector_ty],
             [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
@@ -170,7 +176,6 @@
             [llvm_anyvector_ty],
             [IntrNoMem, IntrSpeculatable]>;
 
-
 //===----------------------------------------------------------------------===//
 // Bulk memory intrinsics
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1353,6 +1353,24 @@
                            Op.getOperand(3)  // thrown value
                        });
   }
+
+  case Intrinsic::wasm_shuffle: {
+    // Drop in-chain and replace undefs, but otherwise pass through unchanged
+    SDValue Ops[18];
+    size_t OpIdx = 0;
+    Ops[OpIdx++] = Op.getOperand(1);
+    Ops[OpIdx++] = Op.getOperand(2);
+    while (OpIdx < 18) {
+      const SDValue &MaskIdx = Op.getOperand(OpIdx + 1);
+      if (MaskIdx.isUndef() ||
+          cast<ConstantSDNode>(MaskIdx.getNode())->getZExtValue() >= 32) {
+        Ops[OpIdx++] = DAG.getConstant(0, DL, MVT::i32);
+      } else {
+        Ops[OpIdx++] = MaskIdx;
+      }
+    }
+    return DAG.getNode(WebAssemblyISD::SHUFFLE, DL, Op.getValueType(), Ops);
+  }
   }
 }
 
diff --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
--- a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
@@ -141,6 +141,36 @@
   ret <16 x i8> %a
 }
 
+; CHECK-LABEL: shuffle_v16i8:
+; NO-SIMD128-NOT: v8x16
+; SIMD128-NEXT: .functype shuffle_v16i8 (v128, v128) -> (v128){{$}}
+; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
+; SIMD128-SAME: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <16 x i8> @llvm.wasm.shuffle(
+  <16 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+  i32, i32, i32, i32, i32)
+define <16 x i8> @shuffle_v16i8(<16 x i8> %x, <16 x i8> %y) {
+  %res = call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y,
+      i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 35)
+  ret <16 x i8> %res
+}
+
+; CHECK-LABEL: shuffle_undef_v16i8:
+; NO-SIMD128-NOT: v8x16
+; SIMD128-NEXT: .functype shuffle_undef_v16i8 (v128, v128) -> (v128){{$}}
+; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
+; SIMD128-SAME: 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
+  %res = call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y,
+      i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
+      i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
+      i32 undef, i32 undef, i32 undef, i32 2)
+  ret <16 x i8> %res
+}
+
 ; ==============================================================================
 ; 8 x i16
 ; ==============================================================================