diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -636,6 +636,7 @@ def build_vector : SDNode<"ISD::BUILD_VECTOR", SDTypeProfile<1, -1, []>, []>; def scalar_to_vector : SDNode<"ISD::SCALAR_TO_VECTOR", SDTypeProfile<1, 1, []>, []>; +def splat_vector : SDNode<"ISD::SPLAT_VECTOR", SDTypeProfile<1, 1, []>, []>; // vector_extract/vector_insert are deprecated. extractelt/insertelt // are preferred. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -159,6 +159,14 @@ while (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0).getNode(); + unsigned EltSize = N->getValueType(0).getScalarSizeInBits(); + + // Accept splat_vectors that will create all-ones vectors as well + if (N->getOpcode() == ISD::SPLAT_VECTOR) + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(0))) + if (CN->getAPIntValue().countTrailingOnes() >= EltSize) + return true; + if (N->getOpcode() != ISD::BUILD_VECTOR) return false; unsigned i = 0, e = N->getNumOperands(); @@ -179,7 +187,6 @@ // we care if the resultant vector is all ones, not whether the individual // constants are. SDValue NotZero = N->getOperand(i); - unsigned EltSize = N->getValueType(0).getScalarSizeInBits(); if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(NotZero)) { if (CN->getAPIntValue().countTrailingOnes() < EltSize) return false; @@ -203,6 +210,14 @@ while (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0).getNode(); + unsigned EltSize = N->getValueType(0).getScalarSizeInBits(); + + // Accept splat_vectors that will create all-zeros vectors as well + if (N->getOpcode() == ISD::SPLAT_VECTOR) + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(0))) + if (CN->getAPIntValue().countTrailingZeros() >= EltSize) + return true; + if (N->getOpcode() != ISD::BUILD_VECTOR) return false; bool IsAllUndef = true; @@ -218,7 +233,6 @@ // We only want to check enough bits to cover the vector elements, because // we care if the resultant vector is all zeros, not whether the individual // constants are.
- unsigned EltSize = N->getValueType(0).getScalarSizeInBits(); if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op)) { if (CN->getAPIntValue().countTrailingZeros() < EltSize) return false; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def --- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def @@ -32,6 +32,3 @@ HANDLE_NODETYPE(THROW) HANDLE_NODETYPE(MEMORY_COPY) HANDLE_NODETYPE(MEMORY_FILL) - -// Memory intrinsics -HANDLE_MEM_NODETYPE(LOAD_SPLAT) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -123,6 +123,9 @@ // Hoist bitcasts out of shuffles setTargetDAGCombine(ISD::VECTOR_SHUFFLE); + // Recognize potential swizzles early + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + // Support saturating add for i8x16 and i16x8 for (auto Op : {ISD::SADDSAT, ISD::UADDSAT}) for (auto T : {MVT::v16i8, MVT::v8i16}) @@ -132,6 +135,11 @@ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) setOperationAction(ISD::ABS, T, Legal); + // Splats are supported + for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32, MVT::v2i64, + MVT::v2f64}) + setOperationAction(ISD::SPLAT_VECTOR, T, Legal); + // Custom lower BUILD_VECTORs to minimize number of replace_lanes for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32, MVT::v2i64, MVT::v2f64}) @@ -1553,18 +1561,7 @@ }; } if (!Result) { - // Use a splat, but possibly a load_splat - LoadSDNode *SplattedLoad; - if ((SplattedLoad = dyn_cast<LoadSDNode>(SplatValue)) && - SplattedLoad->getMemoryVT() == VecT.getVectorElementType()) { - Result = DAG.getMemIntrinsicNode( - WebAssemblyISD::LOAD_SPLAT, DL, DAG.getVTList(VecT), - {SplattedLoad->getChain(), SplattedLoad->getBasePtr(), - SplattedLoad->getOffset()}, - SplattedLoad->getMemoryVT(), SplattedLoad->getMemOperand()); - } else { - Result = DAG.getSplatBuildVector(VecT, DL, SplatValue); - } + Result = DAG.getNode(ISD::SPLAT_VECTOR, DL, VecT, SplatValue); IsLaneConstructed = [&](size_t _, const SDValue &Lane) { return Lane == SplatValue; }; @@ -1730,6 +1727,56 @@ return DAG.getBitcast(DstType, NewShuffle); } +static SDValue +performINSERT_VECTOR_ELTCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + auto &DAG = DCI.DAG; + + // If this is an insert into an undef vector that could be implemented as a + // swizzle, perform that combine. If target-specific combines were run before + // generic combines, this combine would prevent the insert from being combined + // into a splat, which has a much worse lowering because it cannot be + // turned back into a swizzle.
+ // + // We are looking for the following pattern: + // (insert undef, + // (extract Vec1, + // (sext (extract Vec2, Index)) + // ), + // Index + // ) + // + // To combine to: (swizzle vec1, vec2) + if (!N->getOperand(0).isUndef() || N->getValueType(0) != MVT::v16i8) + return SDValue(); + auto Index1 = N->getOperand(2); + auto Extract1 = N->getOperand(1); + if (Extract1.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + auto Vec1 = Extract1.getOperand(0); + if (Vec1.getValueType() != MVT::v16i8) + return SDValue(); + auto SExt = Extract1.getOperand(1); + if (SExt.getOpcode() != ISD::SIGN_EXTEND) + return SDValue(); + auto Extract2 = SExt.getOperand(0); + if (Extract2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + auto Vec2 = Extract2.getOperand(0); + if (Vec2.getValueType() != MVT::v16i8) + return SDValue(); + auto Index2 = Extract2.getOperand(1); + + auto *CN1 = dyn_cast<ConstantSDNode>(Index1); + auto *CN2 = dyn_cast<ConstantSDNode>(Index2); + if (Index1 == Index2 || + (CN1 && CN2 && CN1->getZExtValue() == CN2->getZExtValue())) + return DAG.getNode(WebAssemblyISD::SWIZZLE, SDLoc(N), MVT::v16i8, Vec1, + Vec2); + + return SDValue(); +} + SDValue WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -1738,5 +1785,7 @@ return SDValue(); case ISD::VECTOR_SHUFFLE: return performVECTOR_SHUFFLECombine(N, DCI); + case ISD::INSERT_VECTOR_ELT: + return performINSERT_VECTOR_ELTCombine(N, DCI); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -87,29 +87,41 @@ defm "" : SIMDLoadSplat<"v32x4", 9>; defm "" : SIMDLoadSplat<"v64x2", 10>; -def wasm_load_splat_t : SDTypeProfile<1, 1, [SDTCisPtrTy<1>]>; -def wasm_load_splat : SDNode<"WebAssemblyISD::LOAD_SPLAT", wasm_load_splat_t, - [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; -def load_splat : PatFrag<(ops node:$addr), (wasm_load_splat node:$addr)>; - -foreach args = [["v16i8", "v8x16"], ["v8i16", "v16x8"], ["v4i32", "v32x4"], - ["v2i64", "v64x2"], ["v4f32", "v32x4"], ["v2f64", "v64x2"]] in { +def load_splat_i8 : + PatFrag<(ops node:$addr), (splat_vector (i32 (extloadi8 node:$addr)))>; +def load_splat_i16 : + PatFrag<(ops node:$addr), (splat_vector (i32 (extloadi16 node:$addr)))>; +def load_splat_i32 : + PatFrag<(ops node:$addr), (splat_vector (i32 (load node:$addr)))>; +def load_splat_i64 : + PatFrag<(ops node:$addr), (splat_vector (i64 (load node:$addr)))>; +def load_splat_f32 : + PatFrag<(ops node:$addr), (splat_vector (f32 (load node:$addr)))>; +def load_splat_f64 : + PatFrag<(ops node:$addr), (splat_vector (f64 (load node:$addr)))>; + +foreach args = [["v16i8", "v8x16", "load_splat_i8"], + ["v8i16", "v16x8", "load_splat_i16"], + ["v4i32", "v32x4", "load_splat_i32"], + ["v2i64", "v64x2", "load_splat_i64"], + ["v4f32", "v32x4", "load_splat_f32"], + ["v2f64", "v64x2", "load_splat_f64"]] in { defm : LoadPatNoOffset<!cast<ValueType>(args[0]), - load_splat, + !cast<PatFrag>(args[2]), "LOAD_SPLAT_"#args[1]>; defm : LoadPatImmOff<!cast<ValueType>(args[0]), - load_splat, + !cast<PatFrag>(args[2]), regPlusImm, "LOAD_SPLAT_"#args[1]>; defm : LoadPatImmOff<!cast<ValueType>(args[0]), - load_splat, + !cast<PatFrag>(args[2]), or_is_add, "LOAD_SPLAT_"#args[1]>; defm : LoadPatOffsetOnly<!cast<ValueType>(args[0]), - load_splat, + !cast<PatFrag>(args[2]), "LOAD_SPLAT_"#args[1]>; defm : LoadPatGlobalAddrOffOnly<!cast<ValueType>(args[0]), - load_splat, + !cast<PatFrag>(args[2]), "LOAD_SPLAT_"#args[1]>; } @@ -192,8 +204,7 @@ // Constant: v128.const multiclass ConstVec<ValueType vec_t, dag ops, dag pat, string args> { - let
isMoveImm = 1, isReMaterializable = 1, - Predicates = [HasUnimplementedSIMD128] in + let isMoveImm = 1, isReMaterializable = 1 in defm CONST_V128_#vec_t : SIMD_I<(outs V128:$dst), ops, (outs), ops, [(set V128:$dst, (vec_t pat))], "v128.const\t$dst, "#args, @@ -246,6 +257,35 @@ (build_vector (f64 fpimm:$i0), (f64 fpimm:$i1)), "$i0, $i1">; +// Prefer v128.const over splats when possible +let Predicates = [HasUnimplementedSIMD128] in { +def : Pat<(v16i8 (splat_vector (i32 imm:$x))), + (v16i8 (CONST_V128_v16i8 + (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), + (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), + (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), + (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), (i32 imm:$x)))>; + +def : Pat<(v8i16 (splat_vector (i32 imm:$x))), + (v8i16 (CONST_V128_v8i16 + (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), + (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), (i32 imm:$x)))>; + +def : Pat<(v4i32 (splat_vector (i32 imm:$x))), + (v4i32 (CONST_V128_v4i32 + (i32 imm:$x), (i32 imm:$x), (i32 imm:$x), (i32 imm:$x)))>; + +def : Pat<(v2i64 (splat_vector (i64 imm:$x))), + (v2i64 (CONST_V128_v2i64 (i64 imm:$x), (i64 imm:$x)))>; + +def : Pat<(v4f32 (splat_vector (f32 fpimm:$x))), + (v4f32 (CONST_V128_v4f32 + (f32 fpimm:$x), (f32 fpimm:$x), (f32 fpimm:$x), (f32 fpimm:$x)))>; + +def : Pat<(v2f64 (splat_vector (f64 fpimm:$x))), + (v2f64 (CONST_V128_v2f64 (f64 fpimm:$x), (f64 fpimm:$x)))>; +} // Predicates = [HasUnimplementedSIMD128] + // Shuffle lanes: shuffle defm SHUFFLE : SIMD_I<(outs V128:$dst), @@ -314,46 +354,19 @@ (SWIZZLE V128:$src, V128:$mask)>; // Create vector with identical lanes: splat -def splat2 : PatFrag<(ops node:$x), (build_vector node:$x, node:$x)>; -def splat4 : PatFrag<(ops node:$x), (build_vector - node:$x, node:$x, node:$x, node:$x)>; -def splat8 : PatFrag<(ops node:$x), (build_vector - node:$x, node:$x, node:$x, node:$x, - node:$x, node:$x, node:$x, node:$x)>; -def splat16 : PatFrag<(ops node:$x), (build_vector - node:$x, node:$x, node:$x, node:$x, - node:$x, node:$x, node:$x, node:$x, - node:$x, node:$x, node:$x, node:$x, - node:$x, node:$x, node:$x, node:$x)>; - multiclass Splat<ValueType vec_t, string vec, WebAssemblyRegClass reg_t, - PatFrag splat_pat, bits<32> simdop> { - // Prefer splats over v128.const for const splats (65 is lowest that works) - let AddedComplexity = 65 in + bits<32> simdop> { defm SPLAT_#vec_t : SIMD_I<(outs V128:$dst), (ins reg_t:$x), (outs), (ins), - [(set (vec_t V128:$dst), (splat_pat reg_t:$x))], + [(set (vec_t V128:$dst), (splat_vector reg_t:$x))], vec#".splat\t$dst, $x", vec#".splat", simdop>; } -defm "" : Splat<v16i8, "i8x16", I32, splat16, 15>; -defm "" : Splat<v8i16, "i16x8", I32, splat8, 16>; -defm "" : Splat<v4i32, "i32x4", I32, splat4, 17>; -defm "" : Splat<v2i64, "i64x2", I64, splat2, 18>; -defm "" : Splat<v4f32, "f32x4", F32, splat4, 19>; -defm "" : Splat<v2f64, "f64x2", F64, splat2, 20>; - -// scalar_to_vector leaves high lanes undefined, so can be a splat -class ScalarSplatPat<ValueType vec_t, ValueType lane_t, WebAssemblyRegClass reg_t> : - Pat<(vec_t (scalar_to_vector (lane_t reg_t:$x))), - (!cast<Instruction>("SPLAT_"#vec_t) reg_t:$x)>; - -def : ScalarSplatPat<v16i8, i32, I32>; -def : ScalarSplatPat<v8i16, i32, I32>; -def : ScalarSplatPat<v4i32, i32, I32>; -def : ScalarSplatPat<v2i64, i64, I64>; -def : ScalarSplatPat<v4f32, f32, F32>; -def : ScalarSplatPat<v2f64, f64, F64>; +defm "" : Splat<v16i8, "i8x16", I32, 15>; +defm "" : Splat<v8i16, "i16x8", I32, 16>; +defm "" : Splat<v4i32, "i32x4", I32, 17>; +defm "" : Splat<v2i64, "i64x2", I64, 18>; +defm "" : Splat<v4f32, "f32x4", F32, 19>; +defm "" : Splat<v2f64, "f64x2", F64, 20>; //===----------------------------------------------------------------------===// // Accessing lanes @@ -746,15 +759,15 @@ (add node:$lhs, node:$rhs), "return N->getFlags().hasNoUnsignedWrap();">; -foreach nodes = [[v16i8, splat16], [v8i16, splat8]] in +foreach type = [v16i8, v8i16] in def : Pat<(wasm_shr_u (add_nuw - (add_nuw (nodes[0] V128:$lhs), (nodes[0] V128:$rhs)), - (nodes[1] (i32 1)) + (add_nuw (type V128:$lhs), (type V128:$rhs)), + (type (splat_vector
(i32 1))) ), (i32 1) ), - (!cast("AVGR_U_"#nodes[0]) V128:$lhs, V128:$rhs)>; + (!cast("AVGR_U_"#type) V128:$lhs, V128:$rhs)>; // Widening dot product: i32x4.dot_i16x8_s let isCommutable = 1 in diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll --- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll @@ -1278,9 +1278,8 @@ ; CHECK-LABEL: min_unordered_v4f32: ; NO-SIMD128-NOT: f32x4 ; SIMD128-NEXT: .functype min_unordered_v4f32 (v128) -> (v128){{$}} -; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2 -; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]] -; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2{{$}} +; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <4 x float> @min_unordered_v4f32(<4 x float> %x) { %cmps = fcmp ule <4 x float> %x, @@ -1292,9 +1291,8 @@ ; CHECK-LABEL: max_unordered_v4f32: ; NO-SIMD128-NOT: f32x4 ; SIMD128-NEXT: .functype max_unordered_v4f32 (v128) -> (v128){{$}} -; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2 -; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]] -; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2 +; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <4 x float> @max_unordered_v4f32(<4 x float> %x) { %cmps = fcmp uge <4 x float> %x, @@ -1306,9 +1304,8 @@ ; CHECK-LABEL: min_ordered_v4f32: ; NO-SIMD128-NOT: f32x4 ; SIMD128-NEXT: .functype min_ordered_v4f32 (v128) -> (v128){{$}} -; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2 -; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]] -; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2{{$}} +; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <4 x float> @min_ordered_v4f32(<4 x float> %x) { %cmps = fcmp ole <4 x float> , %x @@ -1320,9 +1317,8 @@ ; CHECK-LABEL: max_ordered_v4f32: ; NO-SIMD128-NOT: f32x4 ; SIMD128-NEXT: .functype max_ordered_v4f32 (v128) -> (v128){{$}} -; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2 -; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]] -; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2{{$}} +; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <4 x float> @max_ordered_v4f32(<4 x float> %x) { %cmps = fcmp oge <4 x float> , %x @@ -1378,8 +1374,7 @@ ; CHECK-LABEL: min_const_intrinsic_v4f32: ; NO-SIMD128-NOT: f32x4 ; SIMD128-NEXT: .functype min_const_intrinsic_v4f32 () -> (v128){{$}} -; SIMD128-NEXT: f32.const $push[[L:[0-9]+]]=, 0x1.4p2{{$}} -; SIMD128-NEXT: f32x4.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}} +; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <4 x float> @min_const_intrinsic_v4f32() { %a = call <4 x float> @llvm.minimum.v4f32( @@ -1392,8 +1387,7 @@ ; CHECK-LABEL: max_const_intrinsic_v4f32: ; NO-SIMD128-NOT: f32x4 ; SIMD128-NEXT: .functype max_const_intrinsic_v4f32 () -> (v128){{$}} -; SIMD128-NEXT: f32.const 
$push[[L:[0-9]+]]=, 0x1.5p5{{$}} -; SIMD128-NEXT: f32x4.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}} +; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 0x1.5p5, 0x1.5p5, 0x1.5p5, 0x1.5p5{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <4 x float> @max_const_intrinsic_v4f32() { %a = call <4 x float> @llvm.maximum.v4f32( @@ -1482,9 +1476,8 @@ ; CHECK-LABEL: min_unordered_v2f64: ; NO-SIMD128-NOT: f64x2 ; SIMD128-NEXT: .functype min_unordered_v2f64 (v128) -> (v128){{$}} -; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2 -; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]] -; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2{{$}} +; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <2 x double> @min_unordered_v2f64(<2 x double> %x) { %cmps = fcmp ule <2 x double> %x, @@ -1496,9 +1489,8 @@ ; CHECK-LABEL: max_unordered_v2f64: ; NO-SIMD128-NOT: f64x2 ; SIMD128-NEXT: .functype max_unordered_v2f64 (v128) -> (v128){{$}} -; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2 -; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]] -; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2{{$}} +; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <2 x double> @max_unordered_v2f64(<2 x double> %x) { %cmps = fcmp uge <2 x double> %x, @@ -1510,9 +1502,8 @@ ; CHECK-LABEL: min_ordered_v2f64: ; NO-SIMD128-NOT: f64x2 ; SIMD128-NEXT: .functype min_ordered_v2f64 (v128) -> (v128){{$}} -; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2 -; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]] -; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2{{$}} +; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <2 x double> @min_ordered_v2f64(<2 x double> %x) { %cmps = fcmp ole <2 x double> , %x @@ -1524,9 +1515,8 @@ ; CHECK-LABEL: max_ordered_v2f64: ; NO-SIMD128-NOT: f64x2 ; SIMD128-NEXT: .functype max_ordered_v2f64 (v128) -> (v128){{$}} -; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2 -; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]] -; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2{{$}} +; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <2 x double> @max_ordered_v2f64(<2 x double> %x) { %cmps = fcmp oge <2 x double> , %x @@ -1560,8 +1550,7 @@ ; CHECK-LABEL: min_const_intrinsic_v2f64: ; NO-SIMD128-NOT: f64x2 ; SIMD128-NEXT: .functype min_const_intrinsic_v2f64 () -> (v128){{$}} -; SIMD128-NEXT: f64.const $push[[L:[0-9]+]]=, 0x1.4p2{{$}} -; SIMD128-NEXT: f64x2.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}} +; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 0x1.4p2, 0x1.4p2{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <2 x double> @min_const_intrinsic_v2f64() { %a = call <2 x double> @llvm.minimum.v2f64( @@ -1574,8 +1563,7 @@ ; CHECK-LABEL: max_const_intrinsic_v2f64: ; NO-SIMD128-NOT: f64x2 ; SIMD128-NEXT: .functype max_const_intrinsic_v2f64 () -> (v128){{$}} -; SIMD128-NEXT: f64.const $push[[L:[0-9]+]]=, 0x1.5p5{{$}} -; SIMD128-NEXT: f64x2.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}} +; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 0x1.5p5, 
0x1.5p5{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} define <2 x double> @max_const_intrinsic_v2f64() { %a = call <2 x double> @llvm.maximum.v2f64( diff --git a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll --- a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll @@ -93,10 +93,25 @@ ret <8 x i16> %v7 } +;; TODO: This would be a single v8x16.swizzle if the target-specific +;; DAG combine for turning insert_vector_elts into swizzles were run +;; before the generic DAG combine that turns them into +;; splat_vectors. This could be fixed by disabling the generic combine +;; and performing it ourselves at a later stage. + ; CHECK-LABEL: swizzle_one_i8x16: ; CHECK-NEXT: .functype swizzle_one_i8x16 (v128, v128) -> (v128) -; CHECK-NEXT: v8x16.swizzle $push[[L0:[0-9]+]]=, $0, $1 -; CHECK-NEXT: return $pop[[L0]] +; CHECK-NEXT: global.get $push5=, __stack_pointer +; CHECK-NEXT: i32.const $push6=, 16 +; CHECK-NEXT: i32.sub $push8=, $pop5, $pop6 +; CHECK-NEXT: local.tee $push7=, $2=, $pop8 +; CHECK-NEXT: v128.store 0($pop7), $0 +; CHECK-NEXT: i8x16.extract_lane_u $push0=, $1, 0 +; CHECK-NEXT: i32.const $push1=, 15 +; CHECK-NEXT: i32.and $push2=, $pop0, $pop1 +; CHECK-NEXT: i32.or $push3=, $2, $pop2 +; CHECK-NEXT: v8x16.load_splat $push4=, 0($pop3) +; CHECK-NEXT: return $pop4 define <16 x i8> @swizzle_one_i8x16(<16 x i8> %src, <16 x i8> %mask) { %m0 = extractelement <16 x i8> %mask, i32 0 %s0 = extractelement <16 x i8> %src, i8 %m0 @@ -104,6 +119,18 @@ ret <16 x i8> %v0 } +; CHECK-LABEL: swizzle_one_var_i8x16: +; CHECK-NEXT: .functype swizzle_one_var_i8x16 (v128, v128, i32) -> (v128) +; CHECK-NEXT: v8x16.swizzle $push[[L0:[0-9]+]]=, $0, $1 +; CHECK-NEXT: return $pop[[L0]] +define <16 x i8> @swizzle_one_var_i8x16(<16 x i8> %src, <16 x i8> %mask, + i32 %idx) { + %m0 = extractelement <16 x i8> %mask, i32 %idx + %s0 = extractelement <16 x i8> %src, i8 %m0 + %v0 = insertelement <16 x i8> undef, i8 %s0, i32 %idx + ret <16 x i8> %v0 +} + ; CHECK-LABEL: swizzle_all_i8x16: ; CHECK-NEXT: .functype swizzle_all_i8x16 (v128, v128) -> (v128) ; CHECK-NEXT: v8x16.swizzle $push[[L0:[0-9]+]]=, $0, $1 @@ -245,7 +272,7 @@ ; CHECK-LABEL: undef_const_insert_f32x4: ; CHECK-NEXT: .functype undef_const_insert_f32x4 () -> (v128) -; UNIMP-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x0p0, 0x1.5p5, 0x0p0, 0x0p0 +; UNIMP-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.5p5, 0x1.5p5, 0x1.5p5, 0x1.5p5 ; UNIMP-NEXT: return $pop[[L0]] ; SIMD-VM: f32x4.splat define <4 x float> @undef_const_insert_f32x4() { diff --git a/llvm/test/CodeGen/WebAssembly/simd-load-splat.ll b/llvm/test/CodeGen/WebAssembly/simd-load-splat.ll --- a/llvm/test/CodeGen/WebAssembly/simd-load-splat.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-load-splat.ll @@ -8,10 +8,11 @@ ; CHECK-LABEL: load_splat: ; CHECK-NEXT: .functype load_splat (i32, i32) -> (i32) -; CHECK-NEXT: i32.load8_u $[[E:[0-9]+]]=, 0($0){{$}} -; CHECK-NEXT: v8x16.load_splat $push[[V:[0-9]+]]=, 0($0){{$}} -; CHECK-NEXT: v128.store 0($1), $pop[[V]]{{$}} -; CHECK-NEXT: return $[[E]]{{$}} +; CHECK-NEXT: i32.load8_u $push[[L0:[0-9]+]]=, 0($0){{$}} +; CHECK-NEXT: local.tee $push[[L1:[0-9]+]]=, $0=, $pop[[L0]]{{$}} +; CHECK-NEXT: i8x16.splat $push[[L2:[0-9]+]]=, $pop[[L1]]{{$}} +; CHECK-NEXT: v128.store 0($1), $pop[[L2]]{{$}} +; CHECK-NEXT: return $0{{$}} define i8 @load_splat(i8* %p, <16 x i8>* %out) { %e = load i8, i8* %p %v1 = insertelement <16 x i8> undef, i8 %e, i32 0 diff --git 
a/llvm/test/CodeGen/WebAssembly/simd.ll b/llvm/test/CodeGen/WebAssembly/simd.ll --- a/llvm/test/CodeGen/WebAssembly/simd.ll +++ b/llvm/test/CodeGen/WebAssembly/simd.ll @@ -36,7 +36,7 @@ } ; CHECK-LABEL: const_splat_v16i8: -; SIMD128: i8x16.splat +; SIMD128: v128.const $push0=, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42{{$}} define <16 x i8> @const_splat_v16i8() { ret <16 x i8> <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42> @@ -299,7 +299,7 @@ } ; CHECK-LABEL: const_splat_v8i16: -; SIMD128: i16x8.splat +; SIMD128: v128.const $push0=, 42, 42, 42, 42, 42, 42, 42, 42{{$}} define <8 x i16> @const_splat_v8i16() { ret <8 x i16> <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42> } @@ -547,7 +547,7 @@ } ; CHECK-LABEL: const_splat_v4i32: -; SIMD128: i32x4.splat +; SIMD128: v128.const $push0=, 42, 42, 42, 42{{$}} define <4 x i32> @const_splat_v4i32() { ret <4 x i32> <i32 42, i32 42, i32 42, i32 42> } @@ -698,7 +698,7 @@ } ; CHECK-LABEL: const_splat_v2i64: -; SIMD128: i64x2.splat +; SIMD128: v128.const $push0=, 42, 42{{$}} define <2 x i64> @const_splat_v2i64() { ret <2 x i64> <i64 42, i64 42> } @@ -847,7 +847,7 @@ } ; CHECK-LABEL: const_splat_v4f32 -; SIMD128: f32x4.splat +; SIMD128: v128.const $push0=, 0x1.5p5, 0x1.5p5, 0x1.5p5, 0x1.5p5{{$}} define <4 x float> @const_splat_v4f32() { ret <4 x float> <float 42., float 42., float 42., float 42.> } @@ -998,7 +998,7 @@ } ; CHECK-LABEL: const_splat_v2f64: -; SIMD128: f64x2.splat +; SIMD128: v128.const $push0=, 0x1.5p5, 0x1.5p5{{$}} define <2 x double> @const_splat_v2f64() { ret <2 x double> <double 42., double 42.> }
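
Illustrative sketch (not part of the patch): the LLVM IR below shows the three splat shapes this change routes through ISD::SPLAT_VECTOR, assuming something like llc -mtriple=wasm32-unknown-unknown -mattr=+simd128 (plus +unimplemented-simd128 for the v128.const case). The function names and the expected-instruction annotations are hypothetical, not CHECK lines taken from the test suite.

; Splat of a live scalar: with SPLAT_VECTOR marked Legal, this should select
; the SPLAT_v4i32 pattern, i.e. i32x4.splat.
define <4 x i32> @splat_var(i32 %x) {
  %v = insertelement <4 x i32> undef, i32 %x, i32 0
  %s = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %s
}

; Splat of a loaded byte: when the load has no other uses, the load_splat_i8
; PatFrag above should fold load and splat into v8x16.load_splat.
define <16 x i8> @splat_load(i8* %p) {
  %e = load i8, i8* %p
  %v = insertelement <16 x i8> undef, i8 %e, i32 0
  %s = shufflevector <16 x i8> %v, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %s
}

; Constant splat: with unimplemented-simd128 enabled, the new
; Pat<(splat_vector imm)> rules prefer a single v128.const over
; materializing the scalar and doing i32x4.splat.
define <4 x i32> @splat_const() {
  ret <4 x i32> <i32 7, i32 7, i32 7, i32 7>
}

The first function exercises the now-Legal ISD::SPLAT_VECTOR path, the second the load_splat_* PatFrags, and the third the HasUnimplementedSIMD128-gated v128.const patterns, matching the test updates above.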