Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -5530,18 +5530,46 @@ return SDValue(); } +/// getExtFactor - Determine the adjustment factor for the position when +/// generating an "extract from vector registers" instruction. +static unsigned getExtFactor(SDValue &V) { + EVT EltType = V.getValueType().getVectorElementType(); + return EltType.getSizeInBits() / 8; +} + // Gather data to see if the operation can be modelled as a // shuffle in combination with VEXTs. SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned NumElts = VT.getVectorNumElements(); - SmallVector SourceVecs; - SmallVector MinElts; - SmallVector MaxElts; + struct ShuffleSourceInfo { + SDValue Vec; + unsigned MinElt; + unsigned MaxElt; + + // We may insert some combination of BITCASTs and VEXT nodes to force Vec to + // be compatible with the shuffle we intend to construct. As a result + // ShuffleVec will be some sliding window into the original Vec. + SDValue ShuffleVec; + + // Code should guarantee that element i in Vec starts at element "WindowBase + // + i * WindowScale in ShuffleVec". + int WindowBase; + int WindowScale; + + bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } + ShuffleSourceInfo(SDValue Vec) + : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), + WindowScale(1) {} + }; + // First gather all vectors used as an immediate source for this BUILD_VECTOR + // node. + SmallVector Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) @@ -5550,127 +5578,161 @@ // A shuffle can only come from building a vector from various // elements of other vectors. return SDValue(); - } else if (V.getOperand(0).getValueType().getVectorElementType() != - VT.getVectorElementType()) { - // This code doesn't know how to handle shuffles where the vector - // element types do not match (this happens because type legalization - // promotes the return type of EXTRACT_VECTOR_ELT). - // FIXME: It might be appropriate to extend this code to handle - // mismatched types. - return SDValue(); } - // Record this extraction against the appropriate vector if possible... + // Add this element source to the list if it's not already there. SDValue SourceVec = V.getOperand(0); - // If the element number isn't a constant, we can't effectively - // analyze what's going on. - if (!isa(V.getOperand(1))) - return SDValue(); - unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); - bool FoundSource = false; - for (unsigned j = 0; j < SourceVecs.size(); ++j) { - if (SourceVecs[j] == SourceVec) { - if (MinElts[j] > EltNo) - MinElts[j] = EltNo; - if (MaxElts[j] < EltNo) - MaxElts[j] = EltNo; - FoundSource = true; - break; - } - } + auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); + if (Source == Sources.end()) + Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); - // Or record a new source if not... - if (!FoundSource) { - SourceVecs.push_back(SourceVec); - MinElts.push_back(EltNo); - MaxElts.push_back(EltNo); - } + // Update the minimum and maximum lane number seen. + unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); + Source->MinElt = std::min(Source->MinElt, EltNo); + Source->MaxElt = std::max(Source->MaxElt, EltNo); } // Currently only do something sane when at most two source vectors - // involved. - if (SourceVecs.size() > 2) + // are involved. + if (Sources.size() > 2) return SDValue(); - SDValue ShuffleSrcs[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT) }; - int VEXTOffsets[2] = {0, 0}; + // Find out the smallest element size among result and two sources, and use + // it as element size to build the shuffle_vector. + EVT SmallestEltTy = VT.getVectorElementType(); + for (auto &Source : Sources) { + EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); + if (SrcEltTy.bitsLT(SmallestEltTy)) + SmallestEltTy = SrcEltTy; + } + unsigned ResMultiplier = + VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); + NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); + EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); + + // If the source vector is too wide or too narrow, we may nevertheless be able + // to construct a compatible shuffle either by concatenating it with UNDEF or + // extracting a suitable range of elements. + for (auto &Src : Sources) { + EVT SrcVT = Src.ShuffleVec.getValueType(); - // This loop extracts the usage patterns of the source vectors - // and prepares appropriate SDValues for a shuffle if possible. - for (unsigned i = 0; i < SourceVecs.size(); ++i) { - if (SourceVecs[i].getValueType() == VT) { - // No VEXT necessary - ShuffleSrcs[i] = SourceVecs[i]; - VEXTOffsets[i] = 0; + if (SrcVT.getSizeInBits() == VT.getSizeInBits()) + continue; + + // This stage of the search produces a source with the same element type as + // the original, but with a total width matching the BUILD_VECTOR output. + EVT EltVT = SrcVT.getVectorElementType(); + unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); + EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); + + if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { + if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) + return SDValue(); + // We can pad out the smaller vector for free, so if it's part of a + // shuffle... + Src.ShuffleVec = + DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, + DAG.getUNDEF(Src.ShuffleVec.getValueType())); continue; - } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) { - // It probably isn't worth padding out a smaller vector just to - // break it down again in a shuffle. - return SDValue(); } - // Since only 64-bit and 128-bit vectors are legal on ARM and - // we've eliminated the other cases... - assert(SourceVecs[i].getValueType().getVectorNumElements() == 2*NumElts && - "unexpected vector sizes in ReconstructShuffle"); + if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) + return SDValue(); - if (MaxElts[i] - MinElts[i] >= NumElts) { + if (Src.MaxElt - Src.MinElt >= NumSrcElts) { // Span too large for a VEXT to cope return SDValue(); } - if (MinElts[i] >= NumElts) { + if (Src.MinElt >= NumSrcElts) { // The extraction can just take the second half - VEXTOffsets[i] = NumElts; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(NumElts, dl)); - } else if (MaxElts[i] < NumElts) { + Src.ShuffleVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(NumSrcElts, dl, MVT::i32)); + Src.WindowBase = -NumSrcElts; + } else if (Src.MaxElt < NumSrcElts) { // The extraction can just take the first half - VEXTOffsets[i] = 0; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(0, dl)); + Src.ShuffleVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, dl, MVT::i32)); } else { // An actual VEXT is needed - VEXTOffsets[i] = MinElts[i]; - SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(0, dl)); - SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(NumElts, dl)); - ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2, - DAG.getConstant(VEXTOffsets[i], dl, - MVT::i32)); + SDValue VEXTSrc1 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, dl, MVT::i32)); + SDValue VEXTSrc2 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(NumSrcElts, dl, MVT::i32)); + unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); + + Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, + VEXTSrc2, + DAG.getConstant(Imm, dl, MVT::i32)); + Src.WindowBase = -Src.MinElt; } } - SmallVector Mask; + // Another possible incompatibility occurs from the vector element types. We + // can fix this by bitcasting the source vectors to the same type we intend + // for the shuffle. + for (auto &Src : Sources) { + EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); + if (SrcEltTy == SmallestEltTy) + continue; + assert(ShuffleVT.getVectorElementType() == SmallestEltTy); + Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); + Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); + Src.WindowBase *= Src.WindowScale; + } - for (unsigned i = 0; i < NumElts; ++i) { + // Final sanity check before we try to actually produce a shuffle. + for (auto Src : Sources) + assert(Src.ShuffleVec.getValueType() == ShuffleVT); + + // The stars all align, our next step is to produce the mask for the shuffle. + SmallVector Mask(ShuffleVT.getVectorNumElements(), -1); + int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); + for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); - if (Entry.getOpcode() == ISD::UNDEF) { - Mask.push_back(-1); + if (Entry.getOpcode() == ISD::UNDEF) continue; - } - SDValue ExtractVec = Entry.getOperand(0); - int ExtractElt = cast(Op.getOperand(i) - .getOperand(1))->getSExtValue(); - if (ExtractVec == SourceVecs[0]) { - Mask.push_back(ExtractElt - VEXTOffsets[0]); - } else { - Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]); - } + auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); + int EltNo = cast(Entry.getOperand(1))->getSExtValue(); + + // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit + // trunc. So only std::min(SrcBits, DestBits) actually get defined in this + // segment. + EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); + int BitsDefined = std::min(OrigEltTy.getSizeInBits(), + VT.getVectorElementType().getSizeInBits()); + int LanesDefined = BitsDefined / BitsPerShuffleLane; + + // This source is expected to fill ResMultiplier lanes of the final shuffle, + // starting at the appropriate offset. + int *LaneMask = &Mask[i * ResMultiplier]; + + int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; + ExtractBase += NumElts * (Src - Sources.begin()); + for (int j = 0; j < LanesDefined; ++j) + LaneMask[j] = ExtractBase + j; } // Final check before we try to produce nonsense... - if (isShuffleMaskLegal(Mask, VT)) - return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1], - &Mask[0]); + if (!isShuffleMaskLegal(Mask, ShuffleVT)) + return SDValue(); - return SDValue(); + // We can't handle more than two sources. This should have already + // been checked before this point. + assert(Sources.size() <= 2 && "Too many sources!"); + + SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; + for (unsigned i = 0; i < Sources.size(); ++i) + ShuffleOps[i] = Sources[i].ShuffleVec; + + SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], + ShuffleOps[1], &Mask[0]); + return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } /// isShuffleMaskLegal - Targets can use this to indicate that they only Index: test/CodeGen/ARM/vtrn.ll =================================================================== --- test/CodeGen/ARM/vtrn.ll +++ test/CodeGen/ARM/vtrn.ll @@ -335,3 +335,39 @@ %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> ret <8 x i16> %0 } + +; Here we get a build_vector node, where all the incoming extract_element +; values do modify the type. However, we get different input types, as some of +; them get truncated from i32 to i8 (from comparing cmp0 with cmp1) and some of +; them get truncated from i16 to i8 (from comparing cmp2 with cmp3). +define <8 x i8> @vtrn_mismatched_builvector0(<8 x i8> %tr0, <8 x i8> %tr1, + <4 x i32> %cmp0, <4 x i32> %cmp1, + <4 x i16> %cmp2, <4 x i16> %cmp3) { + ; CHECK-LABEL: vtrn_mismatched_builvector0 + ; CHECK: vmovn.i32 + ; CHECK: vtrn + ; CHECK: vbsl + %c0 = icmp ult <4 x i32> %cmp0, %cmp1 + %c1 = icmp ult <4 x i16> %cmp2, %cmp3 + %c = shufflevector <4 x i1> %c0, <4 x i1> %c1, <8 x i32> + %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1 + ret <8 x i8> %rv +} + +; Here we get a build_vector node, where half the incoming extract_element +; values do not modify the type (the values form cmp2), but half of them do +; (from the icmp operation). +define <8 x i8> @vtrn_mismatched_builvector1(<8 x i8> %tr0, <8 x i8> %tr1, + <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) { + ; CHECK-LABEL: vtrn_mismatched_builvector1 + ; We need to extend the 4 x i8 to 4 x i16 in order to perform the vtrn + ; CHECK: vmovl + ; CHECK: vtrn.8 + ; CHECK: vbsl + %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4 + %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1> + %c0 = icmp ult <4 x i32> %cmp0, %cmp1 + %c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> + %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1 + ret <8 x i8> %rv +} Index: test/CodeGen/ARM/vuzp.ll =================================================================== --- test/CodeGen/ARM/vuzp.ll +++ test/CodeGen/ARM/vuzp.ll @@ -285,3 +285,76 @@ %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> ret <4 x i32> %0 } + +define <8 x i8> @vuzp_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) { +; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8. +; This results in a build_vector with mismatched types. We will generate two vmovn.i32 instructions to +; truncate from i32 to i16 and one vuzp to perform the final truncation for i8. +; CHECK-LABEL: vuzp_trunc +; CHECK: vmovn.i32 +; CHECK: vmovn.i32 +; CHECK: vuzp +; CHECK: vbsl + %c = icmp ult <8 x i32> %cmp0, %cmp1 + %res = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1 + ret <8 x i8> %res +} + +; Shuffle the result from the compare with a <4 x i8>. +; We need to extend the loaded <4 x i8> to <4 x i16>. Otherwise we wouldn't be able +; to perform the vuzp and get the vbsl mask. +define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1, + <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) { +; CHECK-LABEL: vuzp_trunc_and_shuffle +; CHECK: vmovl +; CHECK: vuzp +; CHECK: vbsl + %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4 + %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1> + %c0 = icmp ult <4 x i32> %cmp0, %cmp1 + %c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> + %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1 + ret <8 x i8> %rv +} + +; Use an undef value for the <4 x i8> that is being shuffled with the compare result. +; This produces a build_vector with some of the operands undefs. +define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1, + <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) { +; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right +; CHECK: vuzp +; CHECK: vbsl + %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4 + %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1> + %c0 = icmp ult <4 x i32> %cmp0, %cmp1 + %c = shufflevector <4 x i1> %c0, <4 x i1> undef, <8 x i32> + %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1 + ret <8 x i8> %rv +} + +define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1, + <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) { +; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left +; CHECK: vuzp +; CHECK: vbsl + %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4 + %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1> + %c0 = icmp ult <4 x i32> %cmp0, %cmp1 + %c = shufflevector <4 x i1> undef, <4 x i1> %c0, <8 x i32> + %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1 + ret <8 x i8> %rv +} + +; We're using large data types here, and we have to fill with undef values until we +; get some vector size that we can represent. +define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1, + <5 x i32> %cmp0, <5 x i32> %cmp1, <5 x i8> *%cmp2_ptr) { +; CHECK-LABEL: vuzp_wide_type +; CHECK: vbsl + %cmp2_load = load <5 x i8>, <5 x i8> * %cmp2_ptr, align 4 + %cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1> + %c0 = icmp ult <5 x i32> %cmp0, %cmp1 + %c = shufflevector <5 x i1> %c0, <5 x i1> %cmp2, <10 x i32> + %rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1 + ret <10 x i8> %rv +}