diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/IntrinsicsWebAssembly.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; @@ -1565,6 +1566,7 @@ }; } else if (NumConstantLanes >= NumSplatLanes && Subtarget->hasUnimplementedSIMD128()) { + // If we support v128.const, emit it directly SmallVector ConstLanes; for (const SDValue &Lane : Op->op_values()) { if (IsConstant(Lane)) { @@ -1576,11 +1578,59 @@ } } Result = DAG.getBuildVector(VecT, DL, ConstLanes); - IsLaneConstructed = [&](size_t _, const SDValue &Lane) { + IsLaneConstructed = [&IsConstant](size_t _, const SDValue &Lane) { return IsConstant(Lane); }; - } - if (!Result) { + } else if (NumConstantLanes >= NumSplatLanes && VecT.isInteger()) { + // Otherwise, if this is an integer vector, pack the lane values together so + // we can construct the 128-bit constant from a pair of i64s using a splat + // followed by at most one i64x2.replace_lane. Also keep track of the lanes + // that actually matter so we can avoid the replace_lane in more cases. 
+ std::array<uint64_t, 2> I64s{{0, 0}}; + std::array<uint64_t, 2> ConstLaneMasks{{0, 0}}; + size_t LaneBits = 128 / Lanes; + size_t HalfLanes = Lanes / 2; + for (size_t I = 0; I < Lanes; ++I) { + const SDValue &Lane = Op.getOperand(I); + if (IsConstant(Lane)) { + // How much we need to shift Val to position it in an i64 + auto Shift = LaneBits * (I % HalfLanes); + auto Mask = maskTrailingOnes<uint64_t>(LaneBits); + auto Val = cast<ConstantSDNode>(Lane.getNode())->getZExtValue() & Mask; + I64s[I / HalfLanes] |= Val << Shift; + ConstLaneMasks[I / HalfLanes] |= Mask << Shift; + } + } + // Check whether all constant lanes in the second half of the vector are + // equivalent in the first half or vice versa to determine whether splatting + // either side will be sufficient to materialize the constant. As a special + // case, if the first and second halves have no constant lanes in common, we + // can just combine them. + bool FirstHalfSufficient = (I64s[0] & ConstLaneMasks[1]) == I64s[1]; + bool SecondHalfSufficient = (I64s[1] & ConstLaneMasks[0]) == I64s[0]; + bool CombinedSufficient = (ConstLaneMasks[0] & ConstLaneMasks[1]) == 0; + + uint64_t Splatted; + if (SecondHalfSufficient) { + Splatted = I64s[1]; + } else if (CombinedSufficient) { + Splatted = I64s[0] | I64s[1]; + } else { + Splatted = I64s[0]; + } + + Result = DAG.getSplatBuildVector(MVT::v2i64, DL, + DAG.getConstant(Splatted, DL, MVT::i64)); + if (!FirstHalfSufficient && !SecondHalfSufficient && !CombinedSufficient) { + Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2i64, Result, + DAG.getConstant(I64s[1], DL, MVT::i64), + DAG.getConstant(1, DL, MVT::i32)); + } + Result = DAG.getBitcast(VecT, Result); + IsLaneConstructed = [&IsConstant](size_t _, const SDValue &Lane) { + return IsConstant(Lane); + }; + } else { // Use a splat, but possibly a load_splat LoadSDNode *SplattedLoad; if ((SplattedLoad = dyn_cast<LoadSDNode>(SplatValue)) && @@ -1593,11 +1643,14 @@ } else { Result = DAG.getSplatBuildVector(VecT, DL, SplatValue); } - IsLaneConstructed = [&](size_t _, const 
SDValue &Lane) { + IsLaneConstructed = [&SplatValue](size_t _, const SDValue &Lane) { return Lane == SplatValue; }; } + assert(Result); + assert(IsLaneConstructed); + // Add replace_lane instructions for any unhandled values for (size_t I = 0; I < Lanes; ++I) { const SDValue &Lane = Op->getOperand(I); diff --git a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll --- a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll @@ -8,12 +8,95 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" target triple = "wasm32-unknown-unknown" +; CHECK-LABEL: emulated_const_trivial_splat: +; CHECK-NEXT: .functype emulated_const_trivial_splat () -> (v128) +; SIMD-VM-NEXT: i64.const $push0=, 8589934593 +; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0 +; SIMD-VM-NEXT: return $pop1 +; UNIMP: v128.const +define <4 x i32> @emulated_const_trivial_splat() { + ret <4 x i32> +} + +; CHECK-LABEL: emulated_const_first_sufficient: +; CHECK-NEXT: .functype emulated_const_first_sufficient () -> (v128) +; SIMD-VM-NEXT: i64.const $push0=, 8589934593 +; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0 +; SIMD-VM-NEXT: return $pop1 +; UNIMP: v128.const +define <4 x i32> @emulated_const_first_sufficient() { + ret <4 x i32> +} + +; CHECK-LABEL: emulated_const_second_sufficient: +; CHECK-NEXT: .functype emulated_const_second_sufficient () -> (v128) +; SIMD-VM-NEXT: i64.const $push0=, 8589934593 +; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0 +; SIMD-VM-NEXT: return $pop1 +; UNIMP: v128.const +define <4 x i32> @emulated_const_second_sufficient() { + ret <4 x i32> +} + +; CHECK-LABEL: emulated_const_combined_sufficient: +; CHECK-NEXT: .functype emulated_const_combined_sufficient () -> (v128) +; SIMD-VM-NEXT: i64.const $push0=, 8589934593 +; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0 +; SIMD-VM-NEXT: return $pop1 +; UNIMP: v128.const +define <4 x i32> @emulated_const_combined_sufficient() { + ret <4 x i32> +} + 
+; CHECK-LABEL: emulated_const_either_sufficient: +; CHECK-NEXT: .functype emulated_const_either_sufficient () -> (v128) +; SIMD-VM-NEXT: i64.const $push0=, 1 +; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0 +; SIMD-VM-NEXT: return $pop1 +; UNIMP: v128.const +define <4 x i32> @emulated_const_either_sufficient() { + ret <4 x i32> +} + +; CHECK-LABEL: emulated_const_neither_sufficient: +; CHECK-NEXT: .functype emulated_const_neither_sufficient () -> (v128) +; SIMD-VM-NEXT: i64.const $push0=, 8589934593 +; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0 +; SIMD-VM-NEXT: i64.const $push2=, 17179869184 +; SIMD-VM-NEXT: i64x2.replace_lane $push3=, $pop1, 1, $pop2 +; SIMD-VM-NEXT: return $pop3 +define <4 x i32> @emulated_const_neither_sufficient() { + ret <4 x i32> +} + +; CHECK-LABEL: emulated_const_combined_sufficient_large: +; CHECK-NEXT: .functype emulated_const_combined_sufficient_large () -> (v128) +; SIMD-VM-NEXT: i64.const $push0=, 506097522914230528 +; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0 +; SIMD-VM-NEXT: return $pop1 +define <16 x i8> @emulated_const_combined_sufficient_large() { + ret <16 x i8> +} + +; CHECK-LABEL: emulated_const_neither_sufficient_large: +; CHECK-NEXT: .functype emulated_const_neither_sufficient_large () -> (v128) +; SIMD-VM-NEXT: i64.const $push0=, -70368726997663744 +; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0 +; SIMD-VM-NEXT: i64.const $push2=, 504408655873966336 +; SIMD-VM-NEXT: i64x2.replace_lane $push3=, $pop1, 1, $pop2 +; SIMD-VM-NEXT: return $pop3 +define <16 x i8> @emulated_const_neither_sufficient_large() { + ret <16 x i8> +} + ; CHECK-LABEL: same_const_one_replaced_i16x8: ; CHECK-NEXT: .functype same_const_one_replaced_i16x8 (i32) -> (v128) ; UNIMP-NEXT: v128.const $push[[L0:[0-9]+]]=, 42, 42, 42, 42, 42, 0, 42, 42 ; UNIMP-NEXT: i16x8.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 5, $0 ; UNIMP-NEXT: return $pop[[L1]] -; SIMD-VM: i16x8.splat +; SIMD-VM: i64x2.splat define <8 x i16> @same_const_one_replaced_i16x8(i16 %x) { %v = insertelement 
<8 x i16> , @@ -27,7 +110,7 @@ ; UNIMP-NEXT: v128.const $push[[L0:[0-9]+]]=, 1, -2, 3, -4, 5, 0, 7, -8 ; UNIMP-NEXT: i16x8.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 5, $0 ; UNIMP-NEXT: return $pop[[L1]] -; SIMD-VM: i16x8.splat +; SIMD-VM: i64x2.splat define <8 x i16> @different_const_one_replaced_i16x8(i16 %x) { %v = insertelement <8 x i16> , @@ -68,7 +151,7 @@ ; CHECK-NEXT: .functype splat_common_const_i32x4 () -> (v128) ; UNIMP-NEXT: v128.const $push[[L0:[0-9]+]]=, 0, 3, 3, 1 ; UNIMP-NEXT: return $pop[[L0]] -; SIMD-VM: i32x4.splat +; SIMD-VM: i64x2.splat define <4 x i32> @splat_common_const_i32x4() { ret <4 x i32> } @@ -206,7 +289,7 @@ ; UNIMP: i8x16.replace_lane ; UNIMP: i8x16.replace_lane ; UNIMP: return -; SIMD-VM: i8x16.splat +; SIMD-VM: i64x2.splat define <16 x i8> @mashup_const_i8x16(<16 x i8> %src, <16 x i8> %mask, i8 %splatted) { ; swizzle 0 %m0 = extractelement <16 x i8> %mask, i32 0