diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -20339,6 +20339,154 @@ return NewLd; } +/// Given EXTRACT_SUBVECTOR(VECTOR_SHUFFLE(Op0, Op1, Mask)), +/// try to produce VECTOR_SHUFFLE(EXTRACT_SUBVECTOR(Op?, ?), +/// EXTRACT_SUBVECTOR(Op?, ?), +/// Mask')) +/// iff it is legal and profitable to do so. Notably, the trimmed mask +/// (containing only the elements that are extracted) +/// must reference at most two subvectors. +static SDValue foldExtractSubvectorFromShuffleVector(SDNode *N, + SelectionDAG &DAG, + const TargetLowering &TLI, + bool LegalOperations) { + assert(N->getOpcode() == ISD::EXTRACT_SUBVECTOR && + "Must only be called on EXTRACT_SUBVECTOR's"); + + SDValue N0 = N->getOperand(0); + + // Only deal with non-scalable vectors. + EVT NarrowVT = N->getValueType(0); + EVT WideVT = N0.getValueType(); + if (!NarrowVT.isFixedLengthVector() || !WideVT.isFixedLengthVector()) + return SDValue(); + + // The operand must be a shufflevector. + auto *WideShuffleVector = dyn_cast(N0); + if (!WideShuffleVector) + return SDValue(); + + uint64_t FirstExtractedEltIdx = N->getConstantOperandVal(1); + int NumEltsExtracted = NarrowVT.getVectorNumElements(); + assert((FirstExtractedEltIdx % NumEltsExtracted) == 0 && + "Extract index is not a multiple of the output vector length."); + + int WideNumElts = WideVT.getVectorNumElements(); + + SmallVector NewMask; + NewMask.reserve(NumEltsExtracted); + SmallSetVector, 2> + DemandedSubvectors; + + // Try to decode the wide mask into narrow mask from at most two subvectors. + for (int M : WideShuffleVector->getMask() + .drop_front(FirstExtractedEltIdx) + .take_front(NumEltsExtracted)) { + if (M < 0) { + assert(M == -1 && "Unexpected target shuffle mask?"); + // Does not depend on operands, does not require adjustment. + NewMask.emplace_back(M); + continue; + } + + assert(M < (2 * WideNumElts) && "Out-of-bounds shuffle mask?"); + + // From which operand of the shuffle does this shuffle mask element pick? + int WideShufOpIdx = M / WideNumElts; + // Which element of that operand is picked? + int OpEltIdx = M % WideNumElts; + + assert((OpEltIdx + WideShufOpIdx * WideNumElts) == M && + "Shuffle mask vector decomposition failure."); + + // And which NumEltsExtracted-sized subvector of that operand is that? + int OpSubvecIdx = OpEltIdx / NumEltsExtracted; + // And which element within that subvector of that operand is that? + int OpEltIdxInSubvec = OpEltIdx % NumEltsExtracted; + + assert((OpEltIdxInSubvec + OpSubvecIdx * NumEltsExtracted) == OpEltIdx && + "Shuffle mask subvector decomposition failure."); + + assert((OpEltIdxInSubvec + OpSubvecIdx * NumEltsExtracted + + WideShufOpIdx * WideNumElts) == M && + "Shuffle mask full decomposition failure."); + + SDValue Op = WideShuffleVector->getOperand(WideShufOpIdx); + + if (Op.isUndef()) { + // Picking from an undef operand. Let's adjust mask instead. + NewMask.emplace_back(-1); + continue; + } + + // Profitability check: only deal with extractions from the first subvector. + if (OpSubvecIdx != 0) + return SDValue(); + + const std::pair DemandedSubvector(Op, OpSubvecIdx); + + DemandedSubvectors.insert(DemandedSubvector); + if (DemandedSubvectors.size() > 2) + return SDValue(); // We can't handle more than two subvectors. + + // Ok, but from which operand of the new shuffle will this element pick? + int NewOpIdx = + getFirstIndexOf(DemandedSubvectors.getArrayRef(), DemandedSubvector); + assert((NewOpIdx == 0 || NewOpIdx == 1) && "Unexpected operand index."); + + int AdjM = OpEltIdxInSubvec + NewOpIdx * NumEltsExtracted; + NewMask.emplace_back(AdjM); + } + assert(NewMask.size() == (unsigned)NumEltsExtracted && "Produced bad mask."); + assert(DemandedSubvectors.size() <= 2 && + "Should have ended up demanding at most two subvectors."); + + // Did we discover that the shuffle does not actually depend on operands? + if (DemandedSubvectors.empty()) + return DAG.getUNDEF(NarrowVT); + + // Ok, looks like we will end up forming a new shuffle after all, + // which means that the old one needs to go away. + if (!WideShuffleVector->hasOneUse()) + return SDValue(); + + // And the narrow shufflevector that we'll form must be legal. + if (LegalOperations && + !TLI.isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, NarrowVT)) + return SDValue(); + + // We still perform the exact same EXTRACT_SUBVECTOR, just on different + // operand[s]/index[es], so there is no point in checking for it's legality. + + // Do not turn a legal shuffle into an illegal one. + if (TLI.isShuffleMaskLegal(WideShuffleVector->getMask(), WideVT) && + !TLI.isShuffleMaskLegal(NewMask, NarrowVT)) + return SDValue(); + + SDLoc DL(N); + + SmallVector NewOps; + for (const std::pair + &DemandedSubvector : DemandedSubvectors) { + // How many elements into the WideVT does this subvector start? + int Index = NumEltsExtracted * DemandedSubvector.second; + // Bail out if the extraction isn't going to be cheap. + if (!TLI.isExtractSubvectorCheap(NarrowVT, WideVT, Index)) + return SDValue(); + SDValue IndexC = DAG.getVectorIdxConstant(Index, DL); + NewOps.emplace_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowVT, + DemandedSubvector.first, IndexC)); + } + assert((NewOps.size() == 1 || NewOps.size() == 2) && + "Should end up with either one or two ops"); + + // If we ended up with only one operand, pad with an undef. + if (NewOps.size() == 1) + NewOps.emplace_back(DAG.getUNDEF(NarrowVT)); + + return DAG.getVectorShuffle(NarrowVT, DL, NewOps[0], NewOps[1], NewMask); +} + SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { EVT NVT = N->getValueType(0); SDValue V = N->getOperand(0); @@ -20451,6 +20599,10 @@ } } + if (SDValue V = + foldExtractSubvectorFromShuffleVector(N, DAG, TLI, LegalOperations)) + return V; + V = peekThroughBitcasts(V); // If the input is a build vector. Try to make a smaller build vector. diff --git a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll --- a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll +++ b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=armv7 | FileCheck %s ; ; Ensure that don't crash given a largeish power-of-two shufflevector index. @@ -9,8 +10,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: mov r1, #32 ; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1 -; CHECK-NEXT: vld1.32 {d18, d19}, [r0] -; CHECK-NEXT: vtrn.32 q8, q9 +; CHECK-NEXT: vldr d18, [r0] +; CHECK-NEXT: vtrn.32 d16, d18 ; CHECK-NEXT: vadd.i32 d16, d16, d16 ; CHECK-NEXT: vmov.32 r0, d16[1] ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/ARM/vext.ll b/llvm/test/CodeGen/ARM/vext.ll --- a/llvm/test/CodeGen/ARM/vext.ll +++ b/llvm/test/CodeGen/ARM/vext.ll @@ -218,13 +218,13 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vldr d18, [r0, #32] ; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: vorr d22, d18, d18 +; CHECK-NEXT: vorr d21, d18, d18 ; CHECK-NEXT: vld1.16 {d16, d17}, [r1:128]! ; CHECK-NEXT: vldr d19, [r0, #48] -; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128] -; CHECK-NEXT: vzip.16 d22, d19 -; CHECK-NEXT: vtrn.16 q8, q10 -; CHECK-NEXT: vext.16 d18, d18, d22, #2 +; CHECK-NEXT: vldr d20, [r1] +; CHECK-NEXT: vzip.16 d21, d19 +; CHECK-NEXT: vtrn.16 d16, d20 +; CHECK-NEXT: vext.16 d18, d18, d21, #2 ; CHECK-NEXT: vext.16 d16, d18, d16, #2 ; CHECK-NEXT: vext.16 d16, d16, d16, #2 ; CHECK-NEXT: vmov r0, r1, d16 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -663,8 +663,8 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -672,8 +672,8 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm2, %xmm2 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -758,8 +758,8 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v8i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -767,8 +767,8 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v8i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm2, %xmm2 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -872,9 +872,9 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -882,8 +882,8 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm2, %xmm2 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -1056,8 +1056,8 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v32i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm2, %xmm2 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -709,9 +709,9 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 @@ -736,7 +736,6 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -756,7 +755,6 @@ ; ; AVX2-LABEL: splatvar_funnnel_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] @@ -841,8 +839,6 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -862,7 +858,6 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -877,7 +872,6 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -892,7 +886,6 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -907,7 +900,6 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -922,7 +914,6 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -951,10 +942,10 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 @@ -980,7 +971,6 @@ ; AVX1-LABEL: splatvar_funnnel_v32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero @@ -1008,7 +998,6 @@ ; ; AVX2-LABEL: splatvar_funnnel_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -1031,7 +1020,6 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -1054,7 +1042,6 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -638,8 +638,8 @@ ; ; XOPAVX1-LABEL: splatvar_rotate_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -647,8 +647,8 @@ ; ; XOPAVX2-LABEL: splatvar_rotate_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm2, %xmm2 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -736,8 +736,8 @@ ; ; XOPAVX1-LABEL: splatvar_rotate_v8i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -745,8 +745,8 @@ ; ; XOPAVX2-LABEL: splatvar_rotate_v8i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm2, %xmm2 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -853,9 +853,9 @@ ; ; XOPAVX1-LABEL: splatvar_rotate_v16i16: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -863,8 +863,8 @@ ; ; XOPAVX2-LABEL: splatvar_rotate_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm2, %xmm2 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -1028,8 +1028,8 @@ ; ; XOPAVX2-LABEL: splatvar_rotate_v32i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm2, %xmm2 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -677,9 +677,9 @@ ; ; XOPAVX1-LABEL: splatvar_shift_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vpshaq %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -669,8 +669,8 @@ ; ; XOPAVX2-LABEL: splatvar_shift_v32i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -4,8 +4,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL @@ -1904,25 +1904,11 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: retq ; -; AVX1-LABEL: PR32160: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: PR32160: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: PR32160: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX-LABEL: PR32160: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq ; ; AVX512F-LABEL: PR32160: ; AVX512F: # %bb.0: