Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -19012,20 +19012,6 @@
     Value *Splat = Constant::getNullValue(TruncT);
     return Builder.CreateShuffleVector(Trunc, Splat, ArrayRef<int>{0, 1, 2, 3});
   }
-  case WebAssembly::BI__builtin_wasm_shuffle_i8x16: {
-    Value *Ops[18];
-    size_t OpIdx = 0;
-    Ops[OpIdx++] = EmitScalarExpr(E->getArg(0));
-    Ops[OpIdx++] = EmitScalarExpr(E->getArg(1));
-    while (OpIdx < 18) {
-      Optional<llvm::APSInt> LaneConst =
-          E->getArg(OpIdx)->getIntegerConstantExpr(getContext());
-      assert(LaneConst && "Constant arg isn't actually constant?");
-      Ops[OpIdx++] = llvm::ConstantInt::get(getLLVMContext(), *LaneConst);
-    }
-    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_shuffle);
-    return Builder.CreateCall(Callee, Ops);
-  }
   case WebAssembly::BI__builtin_wasm_relaxed_madd_f32x4:
   case WebAssembly::BI__builtin_wasm_relaxed_nmadd_f32x4:
   case WebAssembly::BI__builtin_wasm_relaxed_madd_f64x2:
Index: clang/lib/Headers/wasm_simd128.h
===================================================================
--- clang/lib/Headers/wasm_simd128.h
+++ clang/lib/Headers/wasm_simd128.h
@@ -1429,31 +1429,31 @@
 #define wasm_i8x16_shuffle(__a, __b, __c0, __c1, __c2, __c3, __c4, __c5, __c6, \
                            __c7, __c8, __c9, __c10, __c11, __c12, __c13,       \
                            __c14, __c15)                                       \
-  ((v128_t)__builtin_wasm_shuffle_i8x16(                                       \
-      (__i8x16)(__a), (__i8x16)(__b), __c0, __c1, __c2, __c3, __c4, __c5,      \
-      __c6, __c7, __c8, __c9, __c10, __c11, __c12, __c13, __c14, __c15))
+  ((v128_t)(__builtin_shufflevector(                                           \
+      (__i8x16)(__a), (__i8x16)(__b), __c0, __c1, __c2, __c3, __c4, __c5,      \
+      __c6, __c7, __c8, __c9, __c10, __c11, __c12, __c13, __c14, __c15)))
 
 #define wasm_i16x8_shuffle(__a, __b, __c0, __c1, __c2, __c3, __c4, __c5, __c6, \
                            __c7)                                               \
-  ((v128_t)__builtin_wasm_shuffle_i8x16(                                       \
+  ((v128_t)(__builtin_shufflevector(                                           \
       (__i8x16)(__a), (__i8x16)(__b), (__c0)*2, (__c0)*2 + 1, (__c1)*2,        \
       (__c1)*2 + 1, (__c2)*2, (__c2)*2 + 1, (__c3)*2, (__c3)*2 + 1, (__c4)*2,  \
       (__c4)*2 + 1, (__c5)*2, (__c5)*2 + 1, (__c6)*2, (__c6)*2 + 1, (__c7)*2,  \
-      (__c7)*2 + 1))
+      (__c7)*2 + 1)))
 
 #define wasm_i32x4_shuffle(__a, __b, __c0, __c1, __c2, __c3)                   \
-  ((v128_t)__builtin_wasm_shuffle_i8x16(                                       \
+  ((v128_t)(__builtin_shufflevector(                                           \
       (__i8x16)(__a), (__i8x16)(__b), (__c0)*4, (__c0)*4 + 1, (__c0)*4 + 2,    \
       (__c0)*4 + 3, (__c1)*4, (__c1)*4 + 1, (__c1)*4 + 2, (__c1)*4 + 3,        \
       (__c2)*4, (__c2)*4 + 1, (__c2)*4 + 2, (__c2)*4 + 3, (__c3)*4,            \
-      (__c3)*4 + 1, (__c3)*4 + 2, (__c3)*4 + 3))
+      (__c3)*4 + 1, (__c3)*4 + 2, (__c3)*4 + 3)))
 
 #define wasm_i64x2_shuffle(__a, __b, __c0, __c1)                               \
-  ((v128_t)__builtin_wasm_shuffle_i8x16(                                       \
+  ((v128_t)(__builtin_shufflevector(                                           \
       (__i8x16)(__a), (__i8x16)(__b), (__c0)*8, (__c0)*8 + 1, (__c0)*8 + 2,    \
       (__c0)*8 + 3, (__c0)*8 + 4, (__c0)*8 + 5, (__c0)*8 + 6, (__c0)*8 + 7,    \
       (__c1)*8, (__c1)*8 + 1, (__c1)*8 + 2, (__c1)*8 + 3, (__c1)*8 + 4,        \
-      (__c1)*8 + 5, (__c1)*8 + 6, (__c1)*8 + 7))
+      (__c1)*8 + 5, (__c1)*8 + 6, (__c1)*8 + 7)))
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_swizzle(v128_t __a,
                                                                v128_t __b) {
Index: clang/test/CodeGen/builtins-wasm.c
===================================================================
--- clang/test/CodeGen/builtins-wasm.c
+++ clang/test/CodeGen/builtins-wasm.c
@@ -649,15 +649,6 @@
 i8x16 swizzle_i8x16(i8x16 x, i8x16 y) {
   return __builtin_wasm_swizzle_i8x16(x, y);
   // WEBASSEMBLY: call <16 x i8> @llvm.wasm.swizzle(<16 x i8> %x, <16 x i8> %y)
-}
-
-i8x16 shuffle(i8x16 x, i8x16 y) {
-  return __builtin_wasm_shuffle_i8x16(x, y, 0, 1, 2, 3, 4, 5, 6, 7,
-                                      8, 9, 10, 11, 12, 13, 14, 15);
-  // WEBASSEMBLY: call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y,
-  // WEBASSEMBLY-SAME: i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
-  // WEBASSEMBLY-SAME: i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14,
-  // WEBASSEMBLY-SAME: i32 15
   // WEBASSEMBLY-NEXT: ret
 }
 
Index: clang/test/Headers/wasm.c
===================================================================
--- clang/test/Headers/wasm.c
+++ clang/test/Headers/wasm.c
@@ -2706,48 +2706,48 @@
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.shuffle(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0)
+// CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i32> <i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
 // CHECK-NEXT:    ret <4 x i32> [[TMP3]]
 //
 v128_t test_i8x16_shuffle(v128_t a, v128_t b) {
-  return wasm_i8x16_shuffle(a, b, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return wasm_i8x16_shuffle(a, b, 23, 22, 21, 20, 19, 18, 17, 16, 7, 6, 5, 4, 3, 2, 1, 0);
 }
 
 // CHECK-LABEL: @test_i16x8_shuffle(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.shuffle(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 14, i32 15, i32 12, i32 13, i32 10, i32 11, i32 8, i32 9, i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1)
+// CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i32> <i32 22, i32 23, i32 20, i32 21, i32 18, i32 19, i32 16, i32 17, i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
 // CHECK-NEXT:    ret <4 x i32> [[TMP3]]
 //
 v128_t test_i16x8_shuffle(v128_t a, v128_t b) {
-  return wasm_i16x8_shuffle(a, b, 7, 6, 5, 4, 3, 2, 1, 0);
+  return wasm_i16x8_shuffle(a, b, 11, 10, 9, 8, 3, 2, 1, 0);
 }
 
 // CHECK-LABEL: @test_i32x4_shuffle(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.shuffle(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3)
+// CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i32> <i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
 // CHECK-NEXT:    ret <4 x i32> [[TMP3]]
 //
 v128_t test_i32x4_shuffle(v128_t a, v128_t b) {
-  return wasm_i32x4_shuffle(a, b, 3, 2, 1, 0);
+  return wasm_i32x4_shuffle(a, b, 5, 4, 1, 0);
 }
 
 // CHECK-LABEL: @test_i64x2_shuffle(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.shuffle(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7)
+// CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
 // CHECK-NEXT:    ret <4 x i32> [[TMP3]]
 //
 v128_t test_i64x2_shuffle(v128_t a, v128_t b) {
-  return wasm_i64x2_shuffle(a, b, 1, 0);
+  return wasm_i64x2_shuffle(a, b, 2, 0);
 }
 
 // CHECK-LABEL: @test_i8x16_swizzle(
Index: llvm/include/llvm/IR/IntrinsicsWebAssembly.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -168,14 +168,6 @@
   DefaultAttrsIntrinsic<[llvm_v16i8_ty],
                         [llvm_v16i8_ty, llvm_v16i8_ty],
                         [IntrNoMem, IntrSpeculatable]>;
-def int_wasm_shuffle :
-  DefaultAttrsIntrinsic<[llvm_v16i8_ty],
-                        [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty,
-                         llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
-                         llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
-                         llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
-                         llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-                        [IntrNoMem, IntrSpeculatable]>;
 def int_wasm_sub_sat_signed :
   DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                         [LLVMMatchType<0>, LLVMMatchType<0>],
Index: llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
===================================================================
--- llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1806,24 +1806,6 @@
     SDValue Node = DAG.getTargetExternalSymbol(SymName, PtrVT);
     return DAG.getNode(WebAssemblyISD::Wrapper, DL, PtrVT, Node);
   }
-
-  case Intrinsic::wasm_shuffle: {
-    // Drop in-chain and replace undefs, but otherwise pass through unchanged
-    SDValue Ops[18];
-    size_t OpIdx = 0;
-    Ops[OpIdx++] = Op.getOperand(1);
-    Ops[OpIdx++] = Op.getOperand(2);
-    while (OpIdx < 18) {
-      const SDValue &MaskIdx = Op.getOperand(OpIdx + 1);
-      if (MaskIdx.isUndef() ||
-          cast<ConstantSDNode>(MaskIdx.getNode())->getZExtValue() >= 32) {
-        Ops[OpIdx++] = DAG.getConstant(0, DL, MVT::i32);
-      } else {
-        Ops[OpIdx++] = MaskIdx;
-      }
-    }
-    return DAG.getNode(WebAssemblyISD::SHUFFLE, DL, Op.getValueType(), Ops);
-  }
   }
 }
 
Index: llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
===================================================================
--- llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
+++ llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
@@ -150,36 +150,6 @@
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: shuffle_v16i8:
-; NO-CHECK-NOT: i8x16
-; CHECK-NEXT: .functype shuffle_v16i8 (v128, v128) -> (v128){{$}}
-; CHECK-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
-; CHECK-SAME: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0{{$}}
-; CHECK-NEXT: return $pop[[R]]{{$}}
-declare <16 x i8> @llvm.wasm.shuffle(
-  <16 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
-  i32, i32, i32, i32, i32)
-define <16 x i8> @shuffle_v16i8(<16 x i8> %x, <16 x i8> %y) {
-  %res = call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y,
-      i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
-      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 35)
-  ret <16 x i8> %res
-}
-
-; CHECK-LABEL: shuffle_undef_v16i8:
-; NO-CHECK-NOT: i8x16
-; CHECK-NEXT: .functype shuffle_undef_v16i8 (v128, v128) -> (v128){{$}}
-; CHECK-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
-; CHECK-SAME: 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2{{$}}
-; CHECK-NEXT: return $pop[[R]]{{$}}
-define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
-  %res = call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y,
-      i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
-      i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
-      i32 undef, i32 undef, i32 undef, i32 2)
-  ret <16 x i8> %res
-}
-
 ; CHECK-LABEL: laneselect_v16i8:
 ; CHECK-NEXT: .functype laneselect_v16i8 (v128, v128, v128) -> (v128){{$}}
 ; CHECK-NEXT: i8x16.relaxed_laneselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
Index: llvm/test/CodeGen/WebAssembly/simd-shuffle.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/WebAssembly/simd-shuffle.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 -fast-isel | FileCheck %s
+
+; Test SIMD128 shuffle lowering: a generic shufflevector on <16 x i8> should
+; select to a single i8x16.shuffle instruction.
+
+target triple = "wasm32-unknown-unknown"
+
+; Mask indices 0-15 select lanes of %x and 16-31 select lanes of %y; in-range
+; indices are expected to reach the instruction unchanged.
+; CHECK-LABEL: shuffle_v16i8:
+; CHECK-NEXT: .functype shuffle_v16i8 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
+; CHECK-SAME: 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @shuffle_v16i8(<16 x i8> %x, <16 x i8> %y) {
+  %res = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  ret <16 x i8> %res
+}
+
+; Undef mask elements are expected to be emitted as lane index 0.
+; CHECK-LABEL: shuffle_undef_v16i8:
+; CHECK-NEXT: .functype shuffle_undef_v16i8 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
+; CHECK-SAME: 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
+  %res = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> <
+      i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
+      i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
+      i32 undef, i32 undef, i32 undef, i32 25>
+  ret <16 x i8> %res
+}