diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19091,6 +19091,9 @@
   uint64_t InVT1Size = InVT1.getFixedSizeInBits();
   uint64_t InVT2Size = InVT2.getFixedSizeInBits();
 
+  assert(InVT2Size <= InVT1Size &&
+         "Inputs must be sorted to be in non-increasing vector size order.");
+
   // We can't generate a shuffle node with mismatched input and output types.
   // Try to make the types match the type of the output.
   if (InVT1 != VT || InVT2 != VT) {
@@ -19117,7 +19120,10 @@
         // Since we now have shorter input vectors, adjust the offset of the
         // second vector's start.
         Vec2Offset = NumElems;
-      } else if (InVT2Size <= InVT1Size) {
+      } else {
+        assert(InVT2Size <= InVT1Size &&
+               "Second input is not going to be larger than the first one.");
+
         // VecIn1 is wider than the output, and we have another, possibly
         // smaller input. Pad the smaller input with undefs, shuffle at the
         // input vector width, and extract the output.
@@ -19136,11 +19142,6 @@
                                DAG.getUNDEF(InVT1), VecIn2, ZeroIdx);
         }
         ShuffleNumElems = NumElems * 2;
-      } else {
-        // Both VecIn1 and VecIn2 are wider than the output, and VecIn2 is wider
-        // than VecIn1. We can't handle this for now - this case will disappear
-        // when we start sorting the vectors by type.
-        return SDValue();
       }
     } else if (InVT2Size * 2 == VTSize && InVT1Size == VTSize) {
       SmallVector<SDValue, 2> ConcatOps(2, DAG.getUNDEF(InVT2));
@@ -19272,6 +19273,14 @@
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
+  // FIXME: promote to STLExtras.
+  auto getFirstIndexOf = [](auto &&Range, const auto &Val) {
+    auto I = find(Range, Val);
+    if (I == Range.end())
+      return -1L;
+    return std::distance(Range.begin(), I);
+  };
+
   // Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes.
   if (!isTypeLegal(VT))
     return SDValue();
@@ -19333,9 +19342,11 @@
     // Have we seen this input vector before?
     // The vectors are expected to be tiny (usually 1 or 2 elements), so using
     // a map back from SDValues to numbers isn't worth it.
-    unsigned Idx = std::distance(VecIn.begin(), find(VecIn, ExtractedFromVec));
-    if (Idx == VecIn.size())
+    int Idx = getFirstIndexOf(VecIn, ExtractedFromVec);
+    if (Idx == -1) { // A new source vector?
+      Idx = VecIn.size();
       VecIn.push_back(ExtractedFromVec);
+    }
 
     VectorMask[i] = Idx;
   }
@@ -19391,9 +19402,28 @@
     }
   }
 
-  // TODO: We want to sort the vectors by descending length, so that adjacent
-  // pairs have similar length, and the longer vector is always first in the
-  // pair.
+  // Sort input vectors by decreasing vector element count,
+  // while preserving the relative order of equally-sized vectors.
+  // Note that we keep the first "implicit zero vector" as-is.
+  SmallVector<SDValue> SortedVecIn(VecIn);
+  llvm::stable_sort(MutableArrayRef<SDValue>(SortedVecIn).drop_front(),
+                    [](const SDValue &a, const SDValue &b) {
+                      return a.getValueType().getVectorNumElements() >
+                             b.getValueType().getVectorNumElements();
+                    });
+
+  // We now also need to rebuild the VectorMask, because it referenced element
+  // order in VecIn, and we just sorted them.
+  for (int &SourceVectorIndex : VectorMask) {
+    if (SourceVectorIndex <= 0)
+      continue;
+    unsigned Idx = getFirstIndexOf(SortedVecIn, VecIn[SourceVectorIndex]);
+    assert(Idx > 0 && Idx < SortedVecIn.size() &&
+           VecIn[SourceVectorIndex] == SortedVecIn[Idx] && "Remapping failure");
+    SourceVectorIndex = Idx;
+  }
+
+  VecIn = std::move(SortedVecIn);
 
   // TODO: Should this fire if some of the input vectors has illegal type (like
   // it does now), or should we let legalization run its course first?
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1148,156 +1148,54 @@
 ; AVX1-NEXT: vmovdqu (%rdi), %xmm0
 ; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
 ; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2
-; AVX1-NEXT: vpextrw $6, %xmm1, %eax
 ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[14,15,8,9,2,3,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $3, %xmm1, %eax
-; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $5, %xmm0, %eax
-; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $2, %xmm0, %eax
-; AVX1-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $5, %xmm1, %eax
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,12,13,6,7,0,1,10,11,4,5]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5,6,7]
 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[12,13,6,7,0,1,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpinsrw $3, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrw $2, %xmm1, %eax
-; AVX1-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrw $7, %xmm0, %eax
-; AVX1-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrw $4, %xmm0, %eax
-; AVX1-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrw $1, %xmm0, %eax
-; AVX1-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrw $2, %xmm2, %eax
-; AVX1-NEXT: vpextrw $5, %xmm2, %edi
-; AVX1-NEXT: vmovd %edi, %xmm2
-; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $7, %xmm1, %eax
-; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $4, %xmm1, %eax
-; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $1, %xmm1, %eax
-; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm1
-; AVX1-NEXT: vpextrw $6, %xmm0, %eax
-; AVX1-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpextrw $3, %xmm0, %eax
-; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,10,11,4,5,14,15,8,9,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,14,15,8,9,2,3,12,13,6,7,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
 ; AVX1-NEXT: vmovdqu %xmm3, (%rsi)
 ; AVX1-NEXT: vmovdqu %xmm4, (%rdx)
 ; AVX1-NEXT: vmovdqu %xmm0, (%rcx)
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: interleave_24i16_out_reverse:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqu (%rdi), %xmm0
-; AVX2-SLOW-NEXT: vmovdqu 16(%rdi), %xmm1
-; AVX2-SLOW-NEXT: vmovdqu 32(%rdi), %xmm2
-; AVX2-SLOW-NEXT: vpextrw $6, %xmm1, %eax
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[14,15,8,9,2,3,u,u,u,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX2-SLOW-NEXT: vpextrw $3, %xmm1, %eax
-; AVX2-SLOW-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX2-SLOW-NEXT: vmovd %xmm1, %eax
-; AVX2-SLOW-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX2-SLOW-NEXT: vpextrw $5, %xmm0, %eax
-; AVX2-SLOW-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX2-SLOW-NEXT: vpextrw $2, %xmm0, %eax
-; AVX2-SLOW-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
-; AVX2-SLOW-NEXT: vpextrw $5, %xmm1, %eax
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[12,13,6,7,0,1,u,u,u,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpinsrw $3, %eax, %xmm4, %xmm4
-; AVX2-SLOW-NEXT: vpextrw $2, %xmm1, %eax
-; AVX2-SLOW-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
-; AVX2-SLOW-NEXT: vpextrw $7, %xmm0, %eax
-; AVX2-SLOW-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4
-; AVX2-SLOW-NEXT: vpextrw $4, %xmm0, %eax
-; AVX2-SLOW-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4
-; AVX2-SLOW-NEXT: vpextrw $1, %xmm0, %eax
-; AVX2-SLOW-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
-; AVX2-SLOW-NEXT: vpextrw $2, %xmm2, %eax
-; AVX2-SLOW-NEXT: vpextrw $5, %xmm2, %edi
-; AVX2-SLOW-NEXT: vmovd %edi, %xmm2
-; AVX2-SLOW-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpextrw $7, %xmm1, %eax
-; AVX2-SLOW-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpextrw $4, %xmm1, %eax
-; AVX2-SLOW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpextrw $1, %xmm1, %eax
-; AVX2-SLOW-NEXT: vpinsrw $4, %eax, %xmm2, %xmm1
-; AVX2-SLOW-NEXT: vpextrw $6, %xmm0, %eax
-; AVX2-SLOW-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpextrw $3, %xmm0, %eax
-; AVX2-SLOW-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX2-SLOW-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
-; AVX2-SLOW-NEXT: vmovdqu %xmm3, (%rsi)
-; AVX2-SLOW-NEXT: vmovdqu %xmm4, (%rdx)
-; AVX2-SLOW-NEXT: vmovdqu %xmm0, (%rcx)
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: interleave_24i16_out_reverse:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqu (%rdi), %xmm0
-; AVX2-FAST-NEXT: vmovdqu 16(%rdi), %xmm1
-; AVX2-FAST-NEXT: vmovdqu 32(%rdi), %xmm2
-; AVX2-FAST-NEXT: vpextrw $6, %xmm1, %eax
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[14,15,8,9,2,3,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX2-FAST-NEXT: vpextrw $3, %xmm1, %eax
-; AVX2-FAST-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX2-FAST-NEXT: vmovd %xmm1, %eax
-; AVX2-FAST-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX2-FAST-NEXT: vpextrw $5, %xmm0, %eax
-; AVX2-FAST-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX2-FAST-NEXT: vpextrw $2, %xmm0, %eax
-; AVX2-FAST-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
-; AVX2-FAST-NEXT: vpextrw $5, %xmm1, %eax
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[12,13,6,7,0,1,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpinsrw $3, %eax, %xmm4, %xmm4
-; AVX2-FAST-NEXT: vpextrw $2, %xmm1, %eax
-; AVX2-FAST-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
-; AVX2-FAST-NEXT: vpextrw $7, %xmm0, %eax
-; AVX2-FAST-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4
-; AVX2-FAST-NEXT: vpextrw $4, %xmm0, %eax
-; AVX2-FAST-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4
-; AVX2-FAST-NEXT: vpextrw $1, %xmm0, %eax
-; AVX2-FAST-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
-; AVX2-FAST-NEXT: vpextrw $7, %xmm1, %eax
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpextrw $4, %xmm1, %eax
-; AVX2-FAST-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpextrw $1, %xmm1, %eax
-; AVX2-FAST-NEXT: vpinsrw $4, %eax, %xmm2, %xmm1
-; AVX2-FAST-NEXT: vpextrw $6, %xmm0, %eax
-; AVX2-FAST-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpextrw $3, %xmm0, %eax
-; AVX2-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vmovd %xmm0, %eax
-; AVX2-FAST-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
-; AVX2-FAST-NEXT: vmovdqu %xmm3, (%rsi)
-; AVX2-FAST-NEXT: vmovdqu %xmm4, (%rdx)
-; AVX2-FAST-NEXT: vmovdqu %xmm0, (%rcx)
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: interleave_24i16_out_reverse:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; AVX2-NEXT: vmovdqu 16(%rdi), %xmm1
+; AVX2-NEXT: vmovdqu 32(%rdi), %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[14,15,8,9,2,3,12,13,6,7,0,1,10,11,4,5]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,6,7,0,1,10,11,4,5,14,15,8,9,2,3]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,4,5,14,15,8,9,2,3,12,13,6,7,0,1]
+; AVX2-NEXT: vmovdqu %xmm3, (%rsi)
+; AVX2-NEXT: vmovdqu %xmm4, (%rdx)
+; AVX2-NEXT: vmovdqu %xmm0, (%rcx)
+; AVX2-NEXT: retq
 ;
 ; XOP-LABEL: interleave_24i16_out_reverse:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vmovdqu (%rdi), %xmm0
 ; XOP-NEXT: vmovdqu 16(%rdi), %xmm1
 ; XOP-NEXT: vmovdqu 32(%rdi), %xmm2
-; XOP-NEXT: vpextrw $5, %xmm0, %eax
-; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm2[14,15,8,9,2,3],xmm1[12,13,6,7,0,1],xmm2[u,u,u,u]
-; XOP-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
-; XOP-NEXT: vpextrw $2, %xmm0, %eax
-; XOP-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
-; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm2[12,13,6,7,0,1],xmm1[10,11,4,5],xmm2[u,u,u,u,u,u]
-; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,7,8,9],xmm0[14,15,8,9,2,3]
-; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm2[10,11,4,5],xmm1[14,15,8,9,2,3,u,u,u,u,u,u]
-; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6,7,8,9],xmm0[12,13,6,7,0,1]
+; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm2[14,15,8,9,2,3],xmm3[12,13,6,7,0,1,10,11,4,5]
+; XOP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
+; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm2[12,13,6,7,0,1],xmm4[10,11,4,5,14,15,8,9,2,3]
+; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm2[10,11,4,5],xmm0[14,15,8,9,2,3,12,13,6,7,0,1]
 ; XOP-NEXT: vmovdqu %xmm3, (%rsi)
 ; XOP-NEXT: vmovdqu %xmm4, (%rdx)
 ; XOP-NEXT: vmovdqu %xmm0, (%rcx)
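To illustrate the idea behind the DAGCombiner change above: the patch stable-sorts the source vectors by decreasing element count while keeping slot 0 (the "implicit zero vector") in place, then remaps every VectorMask entry to the position its source vector ended up at after the sort. The standalone C++ sketch below shows that sort-and-remap pattern using plain standard-library containers; the names Source and sortSourcesAndRemap are invented for the example, and it is not the LLVM implementation itself.

// Standalone sketch of the sort-and-remap pattern (assumed names, not LLVM code).
#include <algorithm>
#include <cassert>
#include <iostream>
#include <utility>
#include <vector>

struct Source {
  int Id;               // stands in for an SDValue source vector
  unsigned NumElements; // stands in for getVectorNumElements()
};

void sortSourcesAndRemap(std::vector<Source> &Sources, std::vector<int> &Mask) {
  // Keep element 0 (the implicit-zero-vector slot) fixed; stable-sort the
  // rest by decreasing element count so equally-sized sources keep their
  // relative order.
  std::vector<Source> Sorted(Sources);
  std::stable_sort(Sorted.begin() + 1, Sorted.end(),
                   [](const Source &A, const Source &B) {
                     return A.NumElements > B.NumElements;
                   });

  // The mask referenced positions in the old order, so remap each entry to
  // the index of the same source in the sorted order.
  for (int &SourceIndex : Mask) {
    if (SourceIndex <= 0)
      continue; // -1 (undef) and 0 (zero vector) stay as-is.
    auto It = std::find_if(Sorted.begin() + 1, Sorted.end(),
                           [&](const Source &S) {
                             return S.Id == Sources[SourceIndex].Id;
                           });
    assert(It != Sorted.end() && "Remapping failure");
    SourceIndex = static_cast<int>(It - Sorted.begin());
  }

  Sources = std::move(Sorted);
}

int main() {
  // Slot 0 is the implicit zero vector; the other sources have 2 and 4
  // elements, so after sorting the 4-element source comes first.
  std::vector<Source> Sources = {{0, 0}, {1, 2}, {2, 4}};
  std::vector<int> Mask = {1, 1, 2, 2, 2, 2, 0, -1};

  sortSourcesAndRemap(Sources, Mask);

  for (int M : Mask)
    std::cout << M << ' '; // prints: 2 2 1 1 1 1 0 -1
  std::cout << '\n';
}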