diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -1604,8 +1604,8 @@ // TODO: Tune this. For example, lanewise swizzling is very expensive, so // swizzled lanes should be given greater weight. - // TODO: Investigate building vectors by shuffling together vectors built by - // separately specialized means. + // TODO: Investigate looping rather than always extracting/replacing specific + // lanes to fill gaps. auto IsConstant = [](const SDValue &V) { return V.getOpcode() == ISD::Constant || V.getOpcode() == ISD::ConstantFP; @@ -1636,12 +1636,30 @@ return std::make_pair(SwizzleSrc, SwizzleIndices); }; + // If the lane is extracted from another vector at a constant index, return + // that vector. The source vector must not have more lanes than the dest + // because the shufflevector indices are in terms of the destination lanes and + // would not be able to address the smaller individual source lanes. + auto GetShuffleSrc = [&](const SDValue &Lane) { + if (Lane->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + if (!isa(Lane->getOperand(1).getNode())) + return SDValue(); + if (Lane->getOperand(0).getValueType().getVectorNumElements() > + VecT.getVectorNumElements()) + return SDValue(); + return Lane->getOperand(0); + }; + using ValueEntry = std::pair; SmallVector SplatValueCounts; using SwizzleEntry = std::pair, size_t>; SmallVector SwizzleCounts; + using ShuffleEntry = std::pair; + SmallVector ShuffleCounts; + auto AddCount = [](auto &Counts, const auto &Val) { auto CountIt = llvm::find_if(Counts, [&Val](auto E) { return E.first == Val; }); @@ -1670,9 +1688,11 @@ AddCount(SplatValueCounts, Lane); - if (IsConstant(Lane)) { + if (IsConstant(Lane)) NumConstantLanes++; - } else if (CanSwizzle) { + if (auto ShuffleSrc = GetShuffleSrc(Lane)) + AddCount(ShuffleCounts, ShuffleSrc); + if (CanSwizzle) { auto SwizzleSrcs = GetSwizzleSrcs(I, Lane); if (SwizzleSrcs.first) AddCount(SwizzleCounts, SwizzleSrcs); @@ -1690,18 +1710,81 @@ std::forward_as_tuple(std::tie(SwizzleSrc, SwizzleIndices), NumSwizzleLanes) = GetMostCommon(SwizzleCounts); + // Shuffles can draw from up to two vectors, so find the two most common + // sources. + SDValue ShuffleSrc1, ShuffleSrc2; + size_t NumShuffleLanes = 0; + if (ShuffleCounts.size()) { + std::tie(ShuffleSrc1, NumShuffleLanes) = GetMostCommon(ShuffleCounts); + ShuffleCounts.erase(std::remove_if(ShuffleCounts.begin(), + ShuffleCounts.end(), + [&](const auto &Pair) { + return Pair.first == ShuffleSrc1; + }), + ShuffleCounts.end()); + } + if (ShuffleCounts.size()) { + size_t AdditionalShuffleLanes; + std::tie(ShuffleSrc2, AdditionalShuffleLanes) = + GetMostCommon(ShuffleCounts); + NumShuffleLanes += AdditionalShuffleLanes; + } + // Predicate returning true if the lane is properly initialized by the // original instruction std::function IsLaneConstructed; SDValue Result; - // Prefer swizzles over vector consts over splats - if (NumSwizzleLanes >= NumSplatLanes && NumSwizzleLanes >= NumConstantLanes) { + // Prefer swizzles over shuffles over vector consts over splats + if (NumSwizzleLanes >= NumShuffleLanes && + NumSwizzleLanes >= NumConstantLanes && NumSwizzleLanes >= NumSplatLanes) { Result = DAG.getNode(WebAssemblyISD::SWIZZLE, DL, VecT, SwizzleSrc, SwizzleIndices); auto Swizzled = std::make_pair(SwizzleSrc, SwizzleIndices); IsLaneConstructed = [&, Swizzled](size_t I, const SDValue &Lane) { return Swizzled == GetSwizzleSrcs(I, Lane); }; + } else if (NumShuffleLanes >= NumConstantLanes && + NumShuffleLanes >= NumSplatLanes) { + size_t DestLaneSize = VecT.getVectorElementType().getFixedSizeInBits() / 8; + size_t DestLaneCount = VecT.getVectorNumElements(); + size_t Scale1 = 1; + size_t Scale2 = 1; + SDValue Src1 = ShuffleSrc1; + SDValue Src2 = ShuffleSrc2 ? ShuffleSrc2 : DAG.getUNDEF(VecT); + if (Src1.getValueType() != VecT) { + size_t LaneSize = + Src1.getValueType().getVectorElementType().getFixedSizeInBits() / 8; + assert(LaneSize > DestLaneSize); + Scale1 = LaneSize / DestLaneSize; + Src1 = DAG.getBitcast(VecT, Src1); + } + if (Src2.getValueType() != VecT) { + size_t LaneSize = + Src2.getValueType().getVectorElementType().getFixedSizeInBits() / 8; + assert(LaneSize > DestLaneSize); + Scale2 = LaneSize / DestLaneSize; + Src2 = DAG.getBitcast(VecT, Src2); + } + + int Mask[16]; + assert(DestLaneCount <= 16); + for (size_t I = 0; I < DestLaneCount; ++I) { + const SDValue &Lane = Op->getOperand(I); + SDValue Src = GetShuffleSrc(Lane); + if (Src == ShuffleSrc1) { + Mask[I] = Lane->getConstantOperandVal(1) * Scale1; + } else if (Src && Src == ShuffleSrc2) { + Mask[I] = DestLaneCount + Lane->getConstantOperandVal(1) * Scale2; + } else { + Mask[I] = -1; + } + } + ArrayRef MaskRef(Mask, DestLaneCount); + Result = DAG.getVectorShuffle(VecT, DL, Src1, Src2, MaskRef); + IsLaneConstructed = [&](size_t, const SDValue &Lane) { + auto Src = GetShuffleSrc(Lane); + return Src == ShuffleSrc1 || (Src && Src == ShuffleSrc2); + }; } else if (NumConstantLanes >= NumSplatLanes) { SmallVector ConstLanes; for (const SDValue &Lane : Op->op_values()) { diff --git a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll --- a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll @@ -165,6 +165,22 @@ ret <8 x i16> %v0 } +; CHECK-LABEL: half_shuffle_i32x4: +; CHECK-NEXT: .functype half_shuffle_i32x4 (v128) -> (v128) +; CHECK: i8x16.shuffle $push[[L0:[0-9]+]]=, $0, $0, 0, 0, 0, 0, 8, 9, 10, 11, 0, 1, 2, 3, 0, 0, 0, 0 +; CHECK: i32x4.replace_lane +; CHECK: i32x4.replace_lane +; CHECK: return +define <4 x i32> @half_shuffle_i32x4(<4 x i32> %src) { + %s0 = extractelement <4 x i32> %src, i32 0 + %s2 = extractelement <4 x i32> %src, i32 2 + %v0 = insertelement <4 x i32> undef, i32 0, i32 0 + %v1 = insertelement <4 x i32> %v0, i32 %s2, i32 1 + %v2 = insertelement <4 x i32> %v1, i32 %s0, i32 2 + %v3 = insertelement <4 x i32> %v2, i32 3, i32 3 + ret <4 x i32> %v3 +} + ; CHECK-LABEL: mashup_swizzle_i8x16: ; CHECK-NEXT: .functype mashup_swizzle_i8x16 (v128, v128, i32) -> (v128) ; CHECK-NEXT: i8x16.swizzle $push[[L0:[0-9]+]]=, $0, $1 diff --git a/llvm/test/CodeGen/WebAssembly/simd-concat.ll b/llvm/test/CodeGen/WebAssembly/simd-concat.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-concat.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s + +; Check that all varieties of vector concatenations get lowered to shuffles. + +target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown--wasm" + +define <16 x i8> @concat_v8i8(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: concat_v8i8: +; CHECK: .functype concat_v8i8 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +; CHECK-NEXT: # fallthrough-return + %v = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> + ret <16 x i8> %v +} + +define <8 x i8> @concat_v4i8(<4 x i8> %a, <4 x i8> %b) { +; CHECK-LABEL: concat_v4i8: +; CHECK: .functype concat_v4i8 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 +; CHECK-NEXT: # fallthrough-return + %v = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> + ret <8 x i8> %v +} + +define <8 x i16> @concat_v4i16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: concat_v4i16: +; CHECK: .functype concat_v4i16 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 +; CHECK-NEXT: # fallthrough-return + %v = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> + ret <8 x i16> %v +} + +define <4 x i8> @concat_v2i8(<2 x i8> %a, <2 x i8> %b) { +; CHECK-LABEL: concat_v2i8: +; CHECK: .functype concat_v2i8 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK-NEXT: # fallthrough-return + %v = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> + ret <4 x i8> %v +} + +define <4 x i16> @concat_v2i16(<2 x i16> %a, <2 x i16> %b) { +; CHECK-LABEL: concat_v2i16: +; CHECK: .functype concat_v2i16 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK-NEXT: # fallthrough-return + %v = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> + ret <4 x i16> %v +} + +define <4 x i32> @concat_v2i32(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: concat_v2i32: +; CHECK: .functype concat_v2i32 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK-NEXT: # fallthrough-return + %v = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> + ret <4 x i32> %v +}