diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -636,6 +636,7 @@ def build_vector : SDNode<"ISD::BUILD_VECTOR", SDTypeProfile<1, -1, []>, []>; def scalar_to_vector : SDNode<"ISD::SCALAR_TO_VECTOR", SDTypeProfile<1, 1, []>, []>; +def splat_vector : SDNode<"ISD::SPLAT_VECTOR", SDTypeProfile<1, 1, []>, []>; // vector_extract/vector_insert are deprecated. extractelt/insertelt // are preferred. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -159,6 +159,14 @@ while (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0).getNode(); + unsigned EltSize = N->getValueType(0).getScalarSizeInBits(); + + // Accept splat_vectors that will create all-ones vectors as well + if (N->getOpcode() == ISD::SPLAT_VECTOR) + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(0))) + if (CN->getAPIntValue().countTrailingOnes() >= EltSize) + return true; + if (N->getOpcode() != ISD::BUILD_VECTOR) return false; unsigned i = 0, e = N->getNumOperands(); @@ -179,7 +187,6 @@ // we care if the resultant vector is all ones, not whether the individual // constants are. SDValue NotZero = N->getOperand(i); - unsigned EltSize = N->getValueType(0).getScalarSizeInBits(); if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(NotZero)) { if (CN->getAPIntValue().countTrailingOnes() < EltSize) return false; @@ -203,6 +210,14 @@ while (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0).getNode(); + unsigned EltSize = N->getValueType(0).getScalarSizeInBits(); + + // Accept splat_vectors that will create all-zeros vectors as well + if (N->getOpcode() == ISD::SPLAT_VECTOR) + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(0))) + if (CN->getAPIntValue().countTrailingZeros() >= EltSize) + return true; + if (N->getOpcode() != ISD::BUILD_VECTOR) return false; bool IsAllUndef = true; @@ -218,7 +233,6 @@ // We only want to check enough bits to cover the vector elements, because // we care if the resultant vector is all zeros, not whether the individual // constants are.
- unsigned EltSize = N->getValueType(0).getScalarSizeInBits(); if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op)) { if (CN->getAPIntValue().countTrailingZeros() < EltSize) return false; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def --- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def @@ -32,6 +32,3 @@ HANDLE_NODETYPE(THROW) HANDLE_NODETYPE(MEMORY_COPY) HANDLE_NODETYPE(MEMORY_FILL) - -// Memory intrinsics -HANDLE_MEM_NODETYPE(LOAD_SPLAT) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -123,6 +123,9 @@ // Hoist bitcasts out of shuffles setTargetDAGCombine(ISD::VECTOR_SHUFFLE); + // Recognize potential swizzles early + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + // Support saturating add for i8x16 and i16x8 for (auto Op : {ISD::SADDSAT, ISD::UADDSAT}) for (auto T : {MVT::v16i8, MVT::v8i16}) @@ -132,6 +135,11 @@ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) setOperationAction(ISD::ABS, T, Legal); + // Splats are supported + for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32, MVT::v2i64, + MVT::v2f64}) + setOperationAction(ISD::SPLAT_VECTOR, T, Legal); + // Custom lower BUILD_VECTORs to minimize number of replace_lanes for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32, MVT::v2i64, MVT::v2f64}) @@ -1553,18 +1561,7 @@ }; } if (!Result) { - // Use a splat, but possibly a load_splat - LoadSDNode *SplattedLoad; - if ((SplattedLoad = dyn_cast<LoadSDNode>(SplatValue)) && - SplattedLoad->getMemoryVT() == VecT.getVectorElementType()) { - Result = DAG.getMemIntrinsicNode( - WebAssemblyISD::LOAD_SPLAT, DL, DAG.getVTList(VecT), - {SplattedLoad->getChain(), SplattedLoad->getBasePtr(), - SplattedLoad->getOffset()}, - SplattedLoad->getMemoryVT(), SplattedLoad->getMemOperand()); - } else { - Result = DAG.getSplatBuildVector(VecT, DL, SplatValue); - } + Result = DAG.getNode(ISD::SPLAT_VECTOR, DL, VecT, SplatValue); IsLaneConstructed = [&](size_t _, const SDValue &Lane) { return Lane == SplatValue; }; @@ -1730,6 +1727,56 @@ return DAG.getBitcast(DstType, NewShuffle); } +static SDValue +performINSERT_VECTOR_ELTCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + auto &DAG = DCI.DAG; + + // If this is an insert into an undef vector that could be implemented as a + // swizzle, perform that combine. If target-specific combines were run before + // generic combines, this combine would prevent the insert from being combined + // into a splat, which has a much worse lowering because it cannot be + // turned back into a swizzle.
+ // + // We are looking for the following pattern: + // (insert undef, + // (extract Vec1, + // (sext (extract Vec2, Index)) + // ), + // Index + // ) + // + // To combine to: (swizzle vec1, vec2) + if (!N->getOperand(0).isUndef() || N->getValueType(0) != MVT::v16i8) + return SDValue(); + auto Index1 = N->getOperand(2); + auto Extract1 = N->getOperand(1); + if (Extract1.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + auto Vec1 = Extract1.getOperand(0); + if (Vec1.getValueType() != MVT::v16i8) + return SDValue(); + auto SExt = Extract1.getOperand(1); + if (SExt.getOpcode() != ISD::SIGN_EXTEND) + return SDValue(); + auto Extract2 = SExt.getOperand(0); + if (Extract2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + auto Vec2 = Extract2.getOperand(0); + if (Vec2.getValueType() != MVT::v16i8) + return SDValue(); + auto Index2 = Extract2.getOperand(1); + + auto *CN1 = dyn_cast<ConstantSDNode>(Index1); + auto *CN2 = dyn_cast<ConstantSDNode>(Index2); + if (Index1 == Index2 || + (CN1 && CN2 && CN1->getZExtValue() == CN2->getZExtValue())) + return DAG.getNode(WebAssemblyISD::SWIZZLE, SDLoc(N), MVT::v16i8, Vec1, + Vec2); + + return SDValue(); +} + SDValue WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -1738,5 +1785,7 @@ return SDValue(); case ISD::VECTOR_SHUFFLE: return performVECTOR_SHUFFLECombine(N, DCI); + case ISD::INSERT_VECTOR_ELT: + return performINSERT_VECTOR_ELTCombine(N, DCI); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -87,29 +87,41 @@ defm "" : SIMDLoadSplat<"v32x4", 9>; defm "" : SIMDLoadSplat<"v64x2", 10>; -def wasm_load_splat_t : SDTypeProfile<1, 1, [SDTCisPtrTy<1>]>; -def wasm_load_splat : SDNode<"WebAssemblyISD::LOAD_SPLAT", wasm_load_splat_t, - [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; -def load_splat : PatFrag<(ops node:$addr), (wasm_load_splat node:$addr)>; - -foreach args = [["v16i8", "v8x16"], ["v8i16", "v16x8"], ["v4i32", "v32x4"], - ["v2i64", "v64x2"], ["v4f32", "v32x4"], ["v2f64", "v64x2"]] in { +def load_splat_i8 : + PatFrag<(ops node:$addr), (splat_vector (i32 (extloadi8 node:$addr)))>; +def load_splat_i16 : + PatFrag<(ops node:$addr), (splat_vector (i32 (extloadi16 node:$addr)))>; +def load_splat_i32 : + PatFrag<(ops node:$addr), (splat_vector (i32 (load node:$addr)))>; +def load_splat_i64 : + PatFrag<(ops node:$addr), (splat_vector (i64 (load node:$addr)))>; +def load_splat_f32 : + PatFrag<(ops node:$addr), (splat_vector (f32 (load node:$addr)))>; +def load_splat_f64 : + PatFrag<(ops node:$addr), (splat_vector (f64 (load node:$addr)))>; + +foreach args = [["v16i8", "v8x16", "load_splat_i8"], + ["v8i16", "v16x8", "load_splat_i16"], + ["v4i32", "v32x4", "load_splat_i32"], + ["v2i64", "v64x2", "load_splat_i64"], + ["v4f32", "v32x4", "load_splat_f32"], + ["v2f64", "v64x2", "load_splat_f64"]] in { defm : LoadPatNoOffset<!cast<ValueType>(args[0]), - load_splat, + !cast<PatFrag>(args[2]), "LOAD_SPLAT_"#args[1]>; defm : LoadPatImmOff<!cast<ValueType>(args[0]), - load_splat, + !cast<PatFrag>(args[2]), regPlusImm, "LOAD_SPLAT_"#args[1]>; defm : LoadPatImmOff<!cast<ValueType>(args[0]), - load_splat, + !cast<PatFrag>(args[2]), or_is_add, "LOAD_SPLAT_"#args[1]>; defm : LoadPatOffsetOnly<!cast<ValueType>(args[0]), - load_splat, + !cast<PatFrag>(args[2]), "LOAD_SPLAT_"#args[1]>; defm : LoadPatGlobalAddrOffOnly<!cast<ValueType>(args[0]), - load_splat, + !cast<PatFrag>(args[2]), "LOAD_SPLAT_"#args[1]>; } @@ -192,8 +204,7 @@ // Constant: v128.const multiclass ConstVec<ValueType vec_t, dag ops, dag pat, string args> { - let
isMoveImm = 1, isReMaterializable = 1, - Predicates = [HasUnimplementedSIMD128] in + let isMoveImm = 1, isReMaterializable = 1 in defm CONST_V128_#vec_t : SIMD_I<(outs V128:$dst), ops, (outs), ops, [(set V128:$dst, (vec_t pat))], "v128.const\t$dst, "#args, @@ -246,6 +257,35 @@ (build_vector (f64 fpimm:$i0), (f64 fpimm:$i1)), "$i0, $i1">; +// Prefer v128.const over splats when possible +let Predicates = [HasUnimplementedSIMD128] in { +def : Pat<(v16i8 (splat_vector (i32 imm:$x))), + (v16i8 (CONST_V128_v16i8 + (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), + (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), + (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), + (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), (i32 imm:$x)))>; + +def : Pat<(v8i16 (splat_vector (i32 imm:$x))), + (v8i16 (CONST_V128_v8i16 + (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), + (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), (i32 imm:$x)))>; + +def : Pat<(v4i32 (splat_vector (i32 imm:$x))), + (v4i32 (CONST_V128_v4i32 + (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), (i32 imm:$x)))>; + +def : Pat<(v2i64 (splat_vector (i64 imm:$x))), + (v2i64 (CONST_V128_v2i64 (i64 imm:$x), (i64 imm:$x)))>; + +def : Pat<(v4f32 (splat_vector (f32 fpimm:$x))), + (v4f32 (CONST_V128_v4f32 + (f32 fpimm:$x), (f32 fpimm:$x), (f32 fpimm:$x), (f32 fpimm:$x)))>; + +def : Pat<(v2f64 (splat_vector (f64 fpimm:$x))), + (v2f64 (CONST_V128_v2f64 (f64 fpimm:$x), (f64 fpimm:$x)))>; +} // Predicates = [HasUnimplementedSIMD128] + // Shuffle lanes: shuffle defm SHUFFLE : SIMD_I<(outs V128:$dst), @@ -314,46 +354,19 @@ (SWIZZLE V128:$src, V128:$mask)>; // Create vector with identical lanes: splat -def splat2 : PatFrag<(ops node:$x), (build_vector node:$x, node:$x)>; -def splat4 : PatFrag<(ops node:$x), (build_vector - node:$x, node:$x, node:$x, node:$x)>; -def splat8 : PatFrag<(ops node:$x), (build_vector - node:$x, node:$x, node:$x, node:$x, - node:$x, node:$x, node:$x, node:$x)>; -def splat16 : PatFrag<(ops node:$x), (build_vector - node:$x, node:$x, node:$x, node:$x, - node:$x, node:$x, node:$x, node:$x, - node:$x, node:$x, node:$x, node:$x, - node:$x, node:$x, node:$x, node:$x)>; - multiclass Splat<ValueType vec_t, string vec, WebAssemblyRegClass reg_t, - PatFrag splat_pat, bits<32> simdop> { - // Prefer splats over v128.const for const splats (65 is lowest that works) - let AddedComplexity = 65 in + bits<32> simdop> { defm SPLAT_#vec_t : SIMD_I<(outs V128:$dst), (ins reg_t:$x), (outs), (ins), - [(set (vec_t V128:$dst), (splat_pat reg_t:$x))], + [(set (vec_t V128:$dst), (splat_vector reg_t:$x))], vec#".splat\t$dst, $x", vec#".splat", simdop>; } -defm "" : Splat<v16i8, "i8x16", I32, splat16, 15>; -defm "" : Splat<v8i16, "i16x8", I32, splat8, 16>; -defm "" : Splat<v4i32, "i32x4", I32, splat4, 17>; -defm "" : Splat<v2i64, "i64x2", I64, splat2, 18>; -defm "" : Splat<v4f32, "f32x4", F32, splat4, 19>; -defm "" : Splat<v2f64, "f64x2", F64, splat2, 20>; - -// scalar_to_vector leaves high lanes undefined, so can be a splat -class ScalarSplatPat<ValueType vec_t, ValueType lane_t, WebAssemblyRegClass reg_t> : - Pat<(vec_t (scalar_to_vector (lane_t reg_t:$x))), - (!cast<Instruction>("SPLAT_"#vec_t) reg_t:$x)>; - -def : ScalarSplatPat<v16i8, i32, I32>; -def : ScalarSplatPat<v8i16, i32, I32>; -def : ScalarSplatPat<v4i32, i32, I32>; -def : ScalarSplatPat<v2i64, i64, I64>; -def : ScalarSplatPat<v4f32, f32, F32>; -def : ScalarSplatPat<v2f64, f64, F64>; +defm "" : Splat<v16i8, "i8x16", I32, 15>; +defm "" : Splat<v8i16, "i16x8", I32, 16>; +defm "" : Splat<v4i32, "i32x4", I32, 17>; +defm "" : Splat<v2i64, "i64x2", I64, 18>; +defm "" : Splat<v4f32, "f32x4", F32, 19>; +defm "" : Splat<v2f64, "f64x2", F64, 20>; //===----------------------------------------------------------------------===// // Accessing lanes @@ -746,15 +759,15 @@ (add node:$lhs, node:$rhs), "return N->getFlags().hasNoUnsignedWrap();">; -foreach nodes = [[v16i8, splat16], [v8i16, splat8]] in +foreach type = [v16i8, v8i16] in def : Pat<(wasm_shr_u (add_nuw - (add_nuw (nodes[0] V128:$lhs), (nodes[0] V128:$rhs)), - (nodes[1] (i32 1)) + (add_nuw (type V128:$lhs), (type V128:$rhs)), + (type (splat_vector
(i32 1))) ), (i32 1) ), - (!cast("AVGR_U_"#nodes[0]) V128:$lhs, V128:$rhs)>; + (!cast("AVGR_U_"#type) V128:$lhs, V128:$rhs)>; // Widening dot product: i32x4.dot_i16x8_s let isCommutable = 1 in diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll --- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll @@ -1278,9 +1278,8 @@ ; CHECK-LABEL: min_unordered_v4f32: ; NO-SIMD128-NOT: f32x4 ; SIMD128-NEXT: .functype min_unordered_v4f32 (v128) -> (v128){{$}} -; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2 -; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]] -; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2{{$}} +; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <4 x float> @min_unordered_v4f32(<4 x float> %x) { %cmps = fcmp ule <4 x float> %x, @@ -1292,9 +1291,8 @@ ; CHECK-LABEL: max_unordered_v4f32: ; NO-SIMD128-NOT: f32x4 ; SIMD128-NEXT: .functype max_unordered_v4f32 (v128) -> (v128){{$}} -; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2 -; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]] -; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2 +; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <4 x float> @max_unordered_v4f32(<4 x float> %x) { %cmps = fcmp uge <4 x float> %x, @@ -1306,9 +1304,8 @@ ; CHECK-LABEL: min_ordered_v4f32: ; NO-SIMD128-NOT: f32x4 ; SIMD128-NEXT: .functype min_ordered_v4f32 (v128) -> (v128){{$}} -; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2 -; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]] -; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2{{$}} +; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <4 x float> @min_ordered_v4f32(<4 x float> %x) { %cmps = fcmp ole <4 x float> , %x @@ -1320,9 +1317,8 @@ ; CHECK-LABEL: max_ordered_v4f32: ; NO-SIMD128-NOT: f32x4 ; SIMD128-NEXT: .functype max_ordered_v4f32 (v128) -> (v128){{$}} -; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2 -; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]] -; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2{{$}} +; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <4 x float> @max_ordered_v4f32(<4 x float> %x) { %cmps = fcmp oge <4 x float> , %x @@ -1378,8 +1374,7 @@ ; CHECK-LABEL: min_const_intrinsic_v4f32: ; NO-SIMD128-NOT: f32x4 ; SIMD128-NEXT: .functype min_const_intrinsic_v4f32 () -> (v128){{$}} -; SIMD128-NEXT: f32.const $push[[L:[0-9]+]]=, 0x1.4p2{{$}} -; SIMD128-NEXT: f32x4.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}} +; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <4 x float> @min_const_intrinsic_v4f32() { %a = call <4 x float> @llvm.minimum.v4f32( @@ -1392,8 +1387,7 @@ ; CHECK-LABEL: max_const_intrinsic_v4f32: ; NO-SIMD128-NOT: f32x4 ; SIMD128-NEXT: .functype max_const_intrinsic_v4f32 () -> (v128){{$}} -; SIMD128-NEXT: f32.const 
$push[[L:[0-9]+]]=, 0x1.5p5{{$}} -; SIMD128-NEXT: f32x4.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}} +; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 0x1.5p5, 0x1.5p5, 0x1.5p5, 0x1.5p5{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <4 x float> @max_const_intrinsic_v4f32() { %a = call <4 x float> @llvm.maximum.v4f32( @@ -1482,9 +1476,8 @@ ; CHECK-LABEL: min_unordered_v2f64: ; NO-SIMD128-NOT: f64x2 ; SIMD128-NEXT: .functype min_unordered_v2f64 (v128) -> (v128){{$}} -; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2 -; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]] -; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2{{$}} +; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <2 x double> @min_unordered_v2f64(<2 x double> %x) { %cmps = fcmp ule <2 x double> %x, @@ -1496,9 +1489,8 @@ ; CHECK-LABEL: max_unordered_v2f64: ; NO-SIMD128-NOT: f64x2 ; SIMD128-NEXT: .functype max_unordered_v2f64 (v128) -> (v128){{$}} -; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2 -; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]] -; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2{{$}} +; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <2 x double> @max_unordered_v2f64(<2 x double> %x) { %cmps = fcmp uge <2 x double> %x, @@ -1510,9 +1502,8 @@ ; CHECK-LABEL: min_ordered_v2f64: ; NO-SIMD128-NOT: f64x2 ; SIMD128-NEXT: .functype min_ordered_v2f64 (v128) -> (v128){{$}} -; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2 -; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]] -; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2{{$}} +; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <2 x double> @min_ordered_v2f64(<2 x double> %x) { %cmps = fcmp ole <2 x double> , %x @@ -1524,9 +1515,8 @@ ; CHECK-LABEL: max_ordered_v2f64: ; NO-SIMD128-NOT: f64x2 ; SIMD128-NEXT: .functype max_ordered_v2f64 (v128) -> (v128){{$}} -; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2 -; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]] -; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2{{$}} +; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <2 x double> @max_ordered_v2f64(<2 x double> %x) { %cmps = fcmp oge <2 x double> , %x @@ -1560,8 +1550,7 @@ ; CHECK-LABEL: min_const_intrinsic_v2f64: ; NO-SIMD128-NOT: f64x2 ; SIMD128-NEXT: .functype min_const_intrinsic_v2f64 () -> (v128){{$}} -; SIMD128-NEXT: f64.const $push[[L:[0-9]+]]=, 0x1.4p2{{$}} -; SIMD128-NEXT: f64x2.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}} +; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 0x1.4p2, 0x1.4p2{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <2 x double> @min_const_intrinsic_v2f64() { %a = call <2 x double> @llvm.minimum.v2f64( @@ -1574,8 +1563,7 @@ ; CHECK-LABEL: max_const_intrinsic_v2f64: ; NO-SIMD128-NOT: f64x2 ; SIMD128-NEXT: .functype max_const_intrinsic_v2f64 () -> (v128){{$}} -; SIMD128-NEXT: f64.const $push[[L:[0-9]+]]=, 0x1.5p5{{$}} -; SIMD128-NEXT: f64x2.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}} +; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 0x1.5p5, 
0x1.5p5{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <2 x double> @max_const_intrinsic_v2f64() { %a = call <2 x double> @llvm.maximum.v2f64( diff --git a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll --- a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll @@ -93,10 +93,25 @@ ret <8 x i16> %v7 } +;; TODO: This would be a single v8x16.swizzle if the target-specific +;; DAG combine for turning insert_vector_elts into swizzles were run +;; before the generic DAG combine that turns them into +;; splat_vectors. This could be fixed by disabling the generic combine +;; and performing it ourselves at a later stage. + ; CHECK-LABEL: swizzle_one_i8x16: ; CHECK-NEXT: .functype swizzle_one_i8x16 (v128, v128) -> (v128) -; CHECK-NEXT: v8x16.swizzle $push[[L0:[0-9]+]]=, $0, $1 -; CHECK-NEXT: return $pop[[L0]] +; CHECK-NEXT: global.get $push5=, __stack_pointer +; CHECK-NEXT: i32.const $push6=, 16 +; CHECK-NEXT: i32.sub $push8=, $pop5, $pop6 +; CHECK-NEXT: local.tee $push7=, $2=, $pop8 +; CHECK-NEXT: v128.store 0($pop7), $0 +; CHECK-NEXT: i8x16.extract_lane_u $push0=, $1, 0 +; CHECK-NEXT: i32.const $push1=, 15 +; CHECK-NEXT: i32.and $push2=, $pop0, $pop1 +; CHECK-NEXT: i32.or $push3=, $2, $pop2 +; CHECK-NEXT: v8x16.load_splat $push4=, 0($pop3) +; CHECK-NEXT: return $pop4 define <16 x i8> @swizzle_one_i8x16(<16 x i8> %src, <16 x i8> %mask) { %m0 = extractelement <16 x i8> %mask, i32 0 %s0 = extractelement <16 x i8> %src, i8 %m0 @@ -104,6 +119,18 @@ ret <16 x i8> %v0 } +; CHECK-LABEL: swizzle_one_var_i8x16: +; CHECK-NEXT: .functype swizzle_one_var_i8x16 (v128, v128, i32) -> (v128) +; CHECK-NEXT: v8x16.swizzle $push[[L0:[0-9]+]]=, $0, $1 +; CHECK-NEXT: return $pop[[L0]] +define <16 x i8> @swizzle_one_var_i8x16(<16 x i8> %src, <16 x i8> %mask, + i32 %idx) { + %m0 = extractelement <16 x i8> %mask, i32 %idx + %s0 = extractelement <16 x i8> %src, i8 %m0 + %v0 = insertelement <16 x i8> undef, i8 %s0, i32 %idx + ret <16 x i8> %v0 +} + ; CHECK-LABEL: swizzle_all_i8x16: ; CHECK-NEXT: .functype swizzle_all_i8x16 (v128, v128) -> (v128) ; CHECK-NEXT: v8x16.swizzle $push[[L0:[0-9]+]]=, $0, $1 @@ -245,7 +272,7 @@ ; CHECK-LABEL: undef_const_insert_f32x4: ; CHECK-NEXT: .functype undef_const_insert_f32x4 () -> (v128) -; UNIMP-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x0p0, 0x1.5p5, 0x0p0, 0x0p0 +; UNIMP-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.5p5, 0x1.5p5, 0x1.5p5, 0x1.5p5 ; UNIMP-NEXT: return $pop[[L0]] ; SIMD-VM: f32x4.splat define <4 x float> @undef_const_insert_f32x4() { diff --git a/llvm/test/CodeGen/WebAssembly/simd-load-splat.ll b/llvm/test/CodeGen/WebAssembly/simd-load-splat.ll --- a/llvm/test/CodeGen/WebAssembly/simd-load-splat.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-load-splat.ll @@ -8,10 +8,11 @@ ; CHECK-LABEL: load_splat: ; CHECK-NEXT: .functype load_splat (i32, i32) -> (i32) -; CHECK-NEXT: i32.load8_u $[[E:[0-9]+]]=, 0($0){{$}} -; CHECK-NEXT: v8x16.load_splat $push[[V:[0-9]+]]=, 0($0){{$}} -; CHECK-NEXT: v128.store 0($1), $pop[[V]]{{$}} -; CHECK-NEXT: return $[[E]]{{$}} +; CHECK-NEXT: i32.load8_u $push[[L0:[0-9]+]]=, 0($0){{$}} +; CHECK-NEXT: local.tee $push[[L1:[0-9]+]]=, $0=, $pop[[L0]]{{$}} +; CHECK-NEXT: i8x16.splat $push[[L2:[0-9]+]]=, $pop[[L1]]{{$}} +; CHECK-NEXT: v128.store 0($1), $pop[[L2]]{{$}} +; CHECK-NEXT: return $0{{$}} define i8 @load_splat(i8* %p, <16 x i8>* %out) { %e = load i8, i8* %p %v1 = insertelement <16 x i8> undef, i8 %e, i32 0 diff --git 
a/llvm/test/CodeGen/WebAssembly/simd.ll b/llvm/test/CodeGen/WebAssembly/simd.ll --- a/llvm/test/CodeGen/WebAssembly/simd.ll +++ b/llvm/test/CodeGen/WebAssembly/simd.ll @@ -36,7 +36,7 @@ } ; CHECK-LABEL: const_splat_v16i8: -; SIMD128: i8x16.splat +; SIMD128: v128.const $push0=, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42{{$}} define <16 x i8> @const_splat_v16i8() { ret <16 x i8> <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42> @@ -299,7 +299,7 @@ } ; CHECK-LABEL: const_splat_v8i16: -; SIMD128: i16x8.splat +; SIMD128: v128.const $push0=, 42, 42, 42, 42, 42, 42, 42, 42{{$}} define <8 x i16> @const_splat_v8i16() { ret <8 x i16> <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42> } @@ -547,7 +547,7 @@ } ; CHECK-LABEL: const_splat_v4i32: -; SIMD128: i32x4.splat +; SIMD128: v128.const $push0=, 42, 42, 42, 42{{$}} define <4 x i32> @const_splat_v4i32() { ret <4 x i32> <i32 42, i32 42, i32 42, i32 42> } @@ -698,7 +698,7 @@ } ; CHECK-LABEL: const_splat_v2i64: -; SIMD128: i64x2.splat +; SIMD128: v128.const $push0=, 42, 42{{$}} define <2 x i64> @const_splat_v2i64() { ret <2 x i64> <i64 42, i64 42> } @@ -847,7 +847,7 @@ } ; CHECK-LABEL: const_splat_v4f32 -; SIMD128: f32x4.splat +; SIMD128: v128.const $push0=, 0x1.5p5, 0x1.5p5, 0x1.5p5, 0x1.5p5{{$}} define <4 x float> @const_splat_v4f32() { ret <4 x float> <float 42., float 42., float 42., float 42.> } @@ -998,7 +998,7 @@ } ; CHECK-LABEL: const_splat_v2f64: -; SIMD128: f64x2.splat +; SIMD128: v128.const $push0=, 0x1.5p5, 0x1.5p5{{$}} define <2 x double> @const_splat_v2f64() { ret <2 x double> <double 42., double 42.> }
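
Illustrative sketch (not part of the patch): the LLVM IR below shows the three splat shapes this change routes through ISD::SPLAT_VECTOR, assuming something like llc -mtriple=wasm32-unknown-unknown -mattr=+simd128 (plus +unimplemented-simd128 for the v128.const case). The function names and the expected-instruction annotations are hypothetical, not CHECK lines taken from the test suite.

; Splat of a live scalar: with SPLAT_VECTOR marked Legal, this should select
; the SPLAT_v4i32 pattern, i.e. i32x4.splat.
define <4 x i32> @splat_var(i32 %x) {
  %v = insertelement <4 x i32> undef, i32 %x, i32 0
  %s = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %s
}

; Splat of a loaded byte: when the load has no other uses, the load_splat_i8
; PatFrag above should fold load and splat into v8x16.load_splat.
define <16 x i8> @splat_load(i8* %p) {
  %e = load i8, i8* %p
  %v = insertelement <16 x i8> undef, i8 %e, i32 0
  %s = shufflevector <16 x i8> %v, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %s
}

; Constant splat: with unimplemented-simd128 enabled, the new
; Pat<(splat_vector imm)> rules prefer a single v128.const over
; materializing the scalar and doing i32x4.splat.
define <4 x i32> @splat_const() {
  ret <4 x i32> <i32 7, i32 7, i32 7, i32 7>
}

The first function exercises the now-Legal ISD::SPLAT_VECTOR path, the second the load_splat_* PatFrags, and the third the HasUnimplementedSIMD128-gated v128.const patterns, matching the test updates above.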