diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -323,10 +323,15 @@
                                        "platform configuration instruction">;
 // On recent X86 (port bound) processors, its preferable to combine to a single shuffle
 // using a variable mask over multiple fixed shuffles.
-def FeatureFastVariableShuffle
-    : SubtargetFeature<"fast-variable-shuffle",
-                       "HasFastVariableShuffle",
-                       "true", "Shuffles with variable masks are fast">;
+def FeatureFastVariableCrossLaneShuffle
+    : SubtargetFeature<"fast-variable-crosslane-shuffle",
+                       "HasFastVariableCrossLaneShuffle",
+                       "true", "Cross-lane shuffles with variable masks are fast">;
+def FeatureFastVariablePerLaneShuffle
+    : SubtargetFeature<"fast-variable-perlane-shuffle",
+                       "HasFastVariablePerLaneShuffle",
+                       "true", "Per-lane shuffles with variable masks are fast">;
+
 // On some X86 processors, a vzeroupper instruction should be inserted after
 // using ymm/zmm registers before executing code that may use SSE instructions.
 def FeatureInsertVZEROUPPER
@@ -639,7 +644,8 @@
                                        FeatureFastScalarFSQRT,
                                        FeatureFastSHLDRotate,
                                        FeatureFast15ByteNOP,
-                                       FeatureFastVariableShuffle,
+                                       FeatureFastVariableCrossLaneShuffle,
+                                       FeatureFastVariablePerLaneShuffle,
                                        FeaturePOPCNTFalseDeps,
                                        FeatureLZCNTFalseDeps,
                                        FeatureInsertVZEROUPPER];
@@ -667,7 +673,8 @@
                                        FeatureFastVectorFSQRT,
                                        FeatureFastSHLDRotate,
                                        FeatureFast15ByteNOP,
-                                       FeatureFastVariableShuffle,
+                                       FeatureFastVariableCrossLaneShuffle,
+                                       FeatureFastVariablePerLaneShuffle,
                                        FeaturePOPCNTFalseDeps,
                                        FeatureInsertVZEROUPPER];
   list<SubtargetFeature> SKLFeatures =
@@ -693,7 +700,8 @@
                                        FeatureFastVectorFSQRT,
                                        FeatureFastSHLDRotate,
                                        FeatureFast15ByteNOP,
-                                       FeatureFastVariableShuffle,
+                                       FeatureFastVariableCrossLaneShuffle,
+                                       FeatureFastVariablePerLaneShuffle,
                                        FeaturePrefer256Bit,
                                        FeaturePOPCNTFalseDeps,
                                        FeatureInsertVZEROUPPER];
@@ -730,7 +738,8 @@
                                        FeatureFastVectorFSQRT,
                                        FeatureFastSHLDRotate,
                                        FeatureFast15ByteNOP,
-                                       FeatureFastVariableShuffle,
+                                       FeatureFastVariableCrossLaneShuffle,
+                                       FeatureFastVariablePerLaneShuffle,
                                        FeaturePrefer256Bit,
                                        FeatureInsertVZEROUPPER];
   list<SubtargetFeature> CNLFeatures =
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15967,9 +15967,9 @@
   if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
     return V;
 
-  // If that doesn't work and we have fast variable shuffle,
+  // If that doesn't work and we have fast variable cross-lane shuffle,
   // attempt 32-bit sublanes (vpermd).
-  if (!Subtarget.hasFastVariableShuffle())
+  if (!Subtarget.hasFastVariableCrossLaneShuffle())
     return SDValue();
 
   return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
@@ -16475,7 +16475,7 @@
     if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
         !is128BitUnpackShuffleMask(HalfMask) &&
         (!isSingleSHUFPSMask(HalfMask) ||
-         Subtarget.hasFastVariableShuffle()))
+         Subtarget.hasFastVariableCrossLaneShuffle()))
       return SDValue();
     // If this is a unary shuffle (assume that the 2nd operand is
     // canonicalized to undef), then we can use vpermpd. Otherwise, we
@@ -35542,7 +35542,8 @@
 
 static SDValue combineX86ShuffleChainWithExtract(
     ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
-    bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
+    bool HasVariableMask, bool AllowVariableCrossLaneMask,
+    bool AllowVariablePerLaneMask, SelectionDAG &DAG,
    const X86Subtarget &Subtarget);
 
 /// Combine an arbitrary chain of shuffles into a single instruction if
@@ -35557,7 +35558,9 @@
 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                                       ArrayRef<int> BaseMask, int Depth,
                                       bool HasVariableMask,
-                                      bool AllowVariableMask, SelectionDAG &DAG,
+                                      bool AllowVariableCrossLaneMask,
+                                      bool AllowVariablePerLaneMask,
+                                      SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
   assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
   assert((Inputs.size() == 1 || Inputs.size() == 2) &&
@@ -36012,8 +36015,14 @@
     return SDValue();
 
   // Depth threshold above which we can efficiently use variable mask shuffles.
-  int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
-  AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
+  int VariableCrossLaneShuffleDepth =
+      Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
+  int VariablePerLaneShuffleDepth =
+      Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
+  AllowVariableCrossLaneMask &=
+      (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
+  AllowVariablePerLaneMask &=
+      (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
   // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
   // higher depth before combining them.
   bool AllowBWIVPERMV3 = (Depth >= 2 || HasVariableMask);
@@ -36022,7 +36031,7 @@
 
   if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
     // If we have a single input lane-crossing shuffle then lower to VPERMV.
-    if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros) {
+    if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
       if (Subtarget.hasAVX2() &&
           (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
         SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
@@ -36047,7 +36056,7 @@
 
     // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
    // vector as the second source (non-VLX will pad to 512-bit shuffles).
-    if (UnaryShuffle && AllowVariableMask &&
+    if (UnaryShuffle && AllowVariableCrossLaneMask &&
        ((Subtarget.hasAVX512() &&
          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
           MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
@@ -36070,13 +36079,14 @@
    // If that failed and either input is extracted then try to combine as a
    // shuffle with the larger type.
    if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
-            Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
-            DAG, Subtarget))
+            Inputs, Root, BaseMask, Depth, HasVariableMask,
+            AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
+            Subtarget))
      return WideShuffle;
 
    // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
    // (non-VLX will pad to 512-bit shuffles).
-    if (AllowVariableMask && !MaskContainsZeros &&
+    if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
        ((Subtarget.hasAVX512() &&
          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
           MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
@@ -36096,7 +36106,7 @@
 
   // See if we can combine a single input shuffle with zeros to a bit-mask,
   // which is much simpler than any shuffle.
-  if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
+  if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
       isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
       DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
     APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
@@ -36124,7 +36134,7 @@
   // If we have a single input shuffle with different shuffle patterns in the
   // the 128-bit lanes use the variable mask to VPERMILPS.
   // TODO Combine other mask types at higher depths.
-  if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
+  if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
       ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
        (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
     SmallVector<SDValue, 16> VPermIdx;
@@ -36141,7 +36151,7 @@
 
   // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
   // to VPERMIL2PD/VPERMIL2PS.
-  if (AllowVariableMask && Subtarget.hasXOP() &&
+  if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
       (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 ||
        MaskVT == MVT::v4f32 || MaskVT == MVT::v8f32)) {
     // VPERMIL2 Operation.
@@ -36179,7 +36189,7 @@
   // Intel's manuals suggest only using PSHUFB if doing so replacing 5
   // instructions, but in practice PSHUFB tends to be *very* fast so we're
   // more aggressive.
-  if (UnaryShuffle && AllowVariableMask &&
+  if (UnaryShuffle && AllowVariablePerLaneMask &&
       ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
        (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
        (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
@@ -36210,7 +36220,8 @@
   // With XOP, if we have a 128-bit binary input shuffle we can always combine
   // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
   // slower than PSHUFB on targets that support both.
-  if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
+  if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
+      Subtarget.hasXOP()) {
     // VPPERM Mask Operation
     // Bits[4:0] - Byte Index (0 - 31)
     // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
@@ -36241,13 +36252,13 @@
   // If that failed and either input is extracted then try to combine as a
   // shuffle with the larger type.
   if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
-          Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
-          DAG, Subtarget))
+          Inputs, Root, BaseMask, Depth, HasVariableMask,
+          AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
     return WideShuffle;
 
   // If we have a dual input shuffle then lower to VPERMV3,
   // (non-VLX will pad to 512-bit shuffles)
-  if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
+  if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
       ((Subtarget.hasAVX512() &&
         (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
         MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
@@ -36255,9 +36266,11 @@
         MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
         MaskVT == MVT::v16i32)) ||
        (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
-        (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+        (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
+         MaskVT == MVT::v32i16)) ||
        (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
-        (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
+        (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
+         MaskVT == MVT::v64i8)))) {
     V1 = CanonicalizeShuffleInput(MaskVT, V1);
     V2 = CanonicalizeShuffleInput(MaskVT, V2);
     Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
@@ -36278,7 +36291,8 @@
 //   extract_subvector(shuffle(x,y,m2),0)
 static SDValue combineX86ShuffleChainWithExtract(
     ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
-    bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
+    bool HasVariableMask, bool AllowVariableCrossLaneMask,
+    bool AllowVariablePerLaneMask, SelectionDAG &DAG,
     const X86Subtarget &Subtarget) {
   unsigned NumMaskElts = BaseMask.size();
   unsigned NumInputs = Inputs.size();
@@ -36366,9 +36380,10 @@
   // Attempt to combine wider chain.
   // TODO: Can we use a better Root?
   SDValue WideRoot = WideInputs[0];
-  if (SDValue WideShuffle = combineX86ShuffleChain(
-          WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
-          AllowVariableMask, DAG, Subtarget)) {
+  if (SDValue WideShuffle =
+          combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
+                                 HasVariableMask, AllowVariableCrossLaneMask,
+                                 AllowVariablePerLaneMask, DAG, Subtarget)) {
     WideShuffle = extractSubVector(WideShuffle, 0, DAG, SDLoc(Root),
                                    RootSizeInBits);
     return DAG.getBitcast(RootVT, WideShuffle);
@@ -36696,8 +36711,9 @@
 static SDValue combineX86ShufflesRecursively(
     ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
     ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
-    unsigned MaxDepth, bool HasVariableMask, bool AllowVariableMask,
-    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+    unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
+    bool AllowVariablePerLaneMask, SelectionDAG &DAG,
+    const X86Subtarget &Subtarget) {
   assert(RootMask.size() > 0 &&
          (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
          "Illegal shuffle root mask");
@@ -36927,13 +36943,17 @@
       SmallVector<int, 64> ResolvedMask = Mask;
       if (EmptyRoot)
         resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
-      bool AllowVar = false;
+      bool AllowCrossLaneVar = false;
+      bool AllowPerLaneVar = false;
       if (Ops[i].getNode()->hasOneUse() ||
-          SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
-        AllowVar = AllowVariableMask;
+          SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
+        AllowCrossLaneVar = AllowVariableCrossLaneMask;
+        AllowPerLaneVar = AllowVariablePerLaneMask;
+      }
       if (SDValue Res = combineX86ShufflesRecursively(
               Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
-              HasVariableMask, AllowVar, DAG, Subtarget))
+              HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
+              Subtarget))
         return Res;
     }
   }
@@ -36984,23 +37004,25 @@
 
     // Finally, try to combine into a single shuffle instruction.
     return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
-                                  AllowVariableMask, DAG, Subtarget);
+                                  AllowVariableCrossLaneMask,
+                                  AllowVariablePerLaneMask, DAG, Subtarget);
   }
 
   // If that failed and any input is extracted then try to combine as a
   // shuffle with the larger type.
-  return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
-                                           HasVariableMask, AllowVariableMask,
-                                           DAG, Subtarget);
+  return combineX86ShuffleChainWithExtract(
+      Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
+      AllowVariablePerLaneMask, DAG, Subtarget);
 }
 
 /// Helper entry wrapper to combineX86ShufflesRecursively.
 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
                                              const X86Subtarget &Subtarget) {
-  return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
-                                       X86::MaxShuffleCombineDepth,
-                                       /*HasVarMask*/ false,
-                                       /*AllowVarMask*/ true, DAG, Subtarget);
+  return combineX86ShufflesRecursively(
+      {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
+      /*HasVarMask*/ false,
+      /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
+      Subtarget);
 }
 
 /// Get the PSHUF-style mask from PSHUF node.
@@ -37463,7 +37485,8 @@
     if (SDValue Res = combineX86ShufflesRecursively(
             {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
             X86::MaxShuffleCombineDepth,
-            /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+            /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
+            /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
       return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                          DAG.getBitcast(SrcVT, Res));
   }
@@ -38958,7 +38981,8 @@
     SDValue NewShuffle = combineX86ShufflesRecursively(
         {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
         /*HasVarMask*/ false,
-        /*AllowVarMask*/ true, TLO.DAG, Subtarget);
+        /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
+        Subtarget);
     if (NewShuffle)
       return TLO.CombineTo(Op, NewShuffle);
   }
@@ -44630,7 +44654,8 @@
       if (SDValue Shuffle = combineX86ShufflesRecursively(
               {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
               X86::MaxShuffleCombineDepth,
-              /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+              /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
+              /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
                            N->getOperand(0).getOperand(1));
     }
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -247,9 +247,13 @@
   /// True if LZCNT/TZCNT instructions have a false dependency on the destination register.
   bool HasLZCNTFalseDeps = false;
 
-  /// True if its preferable to combine to a single shuffle using a variable
-  /// mask over multiple fixed shuffles.
-  bool HasFastVariableShuffle = false;
+  /// True if it's preferable to combine to a single cross-lane shuffle
+  /// using a variable mask over multiple fixed shuffles.
+  bool HasFastVariableCrossLaneShuffle = false;
+
+  /// True if it's preferable to combine to a single per-lane shuffle
+  /// using a variable mask over multiple fixed shuffles.
+  bool HasFastVariablePerLaneShuffle = false;
 
   /// True if vzeroupper instructions should be inserted after code that uses
   /// ymm or zmm registers.
@@ -704,8 +708,11 @@ bool useLeaForSP() const { return UseLeaForSP; } bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; } bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; } - bool hasFastVariableShuffle() const { - return HasFastVariableShuffle; + bool hasFastVariableCrossLaneShuffle() const { + return HasFastVariableCrossLaneShuffle; + } + bool hasFastVariablePerLaneShuffle() const { + return HasFastVariablePerLaneShuffle; } bool insertVZEROUPPER() const { return InsertVZEROUPPER; } bool hasFastGather() const { return HasFastGather; } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -55,7 +55,8 @@ X86::FeatureFastSHLDRotate, X86::FeatureFastScalarShiftMasks, X86::FeatureFastVectorShiftMasks, - X86::FeatureFastVariableShuffle, + X86::FeatureFastVariableCrossLaneShuffle, + X86::FeatureFastVariablePerLaneShuffle, X86::FeatureFastVectorFSQRT, X86::FeatureLEAForSP, X86::FeatureLEAUsesAG, diff --git a/llvm/test/CodeGen/X86/avx2-conversions.ll b/llvm/test/CodeGen/X86/avx2-conversions.ll --- a/llvm/test/CodeGen/X86/avx2-conversions.ll +++ b/llvm/test/CodeGen/X86/avx2-conversions.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-SLOW -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X32,X32-FAST +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X32,X32-FAST-ALL +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X32,X32-FAST-PERLANE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST-PERLANE define <4 x i32> @trunc4(<4 x i64> %A) nounwind { ; X32-SLOW-LABEL: trunc4: @@ -12,13 +14,20 @@ ; X32-SLOW-NEXT: vzeroupper ; X32-SLOW-NEXT: retl ; -; X32-FAST-LABEL: trunc4: -; X32-FAST: # %bb.0: -; X32-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; X32-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; X32-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; X32-FAST-NEXT: vzeroupper -; X32-FAST-NEXT: retl +; X32-FAST-ALL-LABEL: trunc4: +; X32-FAST-ALL: # %bb.0: +; X32-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; X32-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; X32-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; X32-FAST-ALL-NEXT: vzeroupper +; X32-FAST-ALL-NEXT: retl +; +; X32-FAST-PERLANE-LABEL: trunc4: +; X32-FAST-PERLANE: # %bb.0: +; X32-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X32-FAST-PERLANE-NEXT: vzeroupper +; X32-FAST-PERLANE-NEXT: retl ; ; X64-SLOW-LABEL: trunc4: ; X64-SLOW: # %bb.0: @@ -27,13 +36,20 @@ ; X64-SLOW-NEXT: vzeroupper ; 
X64-SLOW-NEXT: retq ; -; X64-FAST-LABEL: trunc4: -; X64-FAST: # %bb.0: -; X64-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; X64-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; X64-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; X64-FAST-NEXT: vzeroupper -; X64-FAST-NEXT: retq +; X64-FAST-ALL-LABEL: trunc4: +; X64-FAST-ALL: # %bb.0: +; X64-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; X64-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; X64-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; X64-FAST-ALL-NEXT: vzeroupper +; X64-FAST-ALL-NEXT: retq +; +; X64-FAST-PERLANE-LABEL: trunc4: +; X64-FAST-PERLANE: # %bb.0: +; X64-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-FAST-PERLANE-NEXT: vzeroupper +; X64-FAST-PERLANE-NEXT: retq %B = trunc <4 x i64> %A to <4 x i32> ret <4 x i32>%B } diff --git a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll --- a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll +++ b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86,X86-SLOW -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X86,X86-FAST +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X86,X86-FAST-ALL +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X86,X86-FAST-PERLANE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST-PERLANE ; AVX2 Logical Shift Left @@ -384,15 +386,25 @@ ; X86-SLOW-NEXT: vzeroupper ; X86-SLOW-NEXT: retl ; -; X86-FAST-LABEL: srl_trunc_and_v4i64: -; X86-FAST: # %bb.0: -; X86-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> -; X86-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; X86-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] -; X86-FAST-NEXT: vpand %xmm2, %xmm1, %xmm1 -; X86-FAST-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; X86-FAST-NEXT: vzeroupper -; X86-FAST-NEXT: retl +; X86-FAST-ALL-LABEL: srl_trunc_and_v4i64: +; X86-FAST-ALL: # %bb.0: +; X86-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; X86-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; X86-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] +; X86-FAST-ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X86-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; X86-FAST-ALL-NEXT: vzeroupper +; X86-FAST-ALL-NEXT: retl +; +; X86-FAST-PERLANE-LABEL: srl_trunc_and_v4i64: +; X86-FAST-PERLANE: # %bb.0: +; X86-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; X86-FAST-PERLANE-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8] +; X86-FAST-PERLANE-NEXT: vandps %xmm2, %xmm1, %xmm1 +; X86-FAST-PERLANE-NEXT: vpsrlvd %xmm1, %xmm0, 
%xmm0 +; X86-FAST-PERLANE-NEXT: vzeroupper +; X86-FAST-PERLANE-NEXT: retl ; ; X64-SLOW-LABEL: srl_trunc_and_v4i64: ; X64-SLOW: # %bb.0: @@ -404,15 +416,25 @@ ; X64-SLOW-NEXT: vzeroupper ; X64-SLOW-NEXT: retq ; -; X64-FAST-LABEL: srl_trunc_and_v4i64: -; X64-FAST: # %bb.0: -; X64-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> -; X64-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; X64-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] -; X64-FAST-NEXT: vpand %xmm2, %xmm1, %xmm1 -; X64-FAST-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; X64-FAST-NEXT: vzeroupper -; X64-FAST-NEXT: retq +; X64-FAST-ALL-LABEL: srl_trunc_and_v4i64: +; X64-FAST-ALL: # %bb.0: +; X64-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; X64-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; X64-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] +; X64-FAST-ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X64-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; X64-FAST-ALL-NEXT: vzeroupper +; X64-FAST-ALL-NEXT: retq +; +; X64-FAST-PERLANE-LABEL: srl_trunc_and_v4i64: +; X64-FAST-PERLANE: # %bb.0: +; X64-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; X64-FAST-PERLANE-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8] +; X64-FAST-PERLANE-NEXT: vandps %xmm2, %xmm1, %xmm1 +; X64-FAST-PERLANE-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; X64-FAST-PERLANE-NEXT: vzeroupper +; X64-FAST-PERLANE-NEXT: retq %and = and <4 x i64> %y, %trunc = trunc <4 x i64> %and to <4 x i32> %sra = lshr <4 x i32> %x, %trunc diff --git a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll --- a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll +++ b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512NOTDQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512,AVX512-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512,AVX512-FAST-PERLANE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512NOTDQ,AVX512NOTDQ-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512NOTDQ,AVX512NOTDQ-FAST-PERLANE define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) { ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1: @@ -322,31 +324,57 @@ ret void } define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) { -; AVX512-LABEL: load_v32i1_broadcast_31_v8i1: -; AVX512: # %bb.0: -; AVX512-NEXT: kmovb 3(%rdi), %k0 -; AVX512-NEXT: vpmovm2d %k0, %ymm2 -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] -; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; 
AVX512-NEXT: vpmovd2m %ymm2, %k1 -; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1} -; AVX512-NEXT: vmovaps %ymm1, (%rsi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512-FAST-LABEL: load_v32i1_broadcast_31_v8i1: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: kmovb 3(%rdi), %k0 +; AVX512-FAST-NEXT: vpmovm2d %k0, %ymm2 +; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] +; AVX512-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512-FAST-NEXT: vpmovd2m %ymm2, %k1 +; AVX512-FAST-NEXT: vmovaps %ymm0, %ymm1 {%k1} +; AVX512-FAST-NEXT: vmovaps %ymm1, (%rsi) +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq ; -; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1: -; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: movzbl 3(%rdi), %eax -; AVX512NOTDQ-NEXT: kmovd %eax, %k1 -; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} -; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] -; AVX512NOTDQ-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1 -; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1} -; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi) -; AVX512NOTDQ-NEXT: vzeroupper -; AVX512NOTDQ-NEXT: retq +; AVX512-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1: +; AVX512-FAST-PERLANE: # %bb.0: +; AVX512-FAST-PERLANE-NEXT: kmovb 3(%rdi), %k0 +; AVX512-FAST-PERLANE-NEXT: vpmovm2d %k0, %ymm2 +; AVX512-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] +; AVX512-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512-FAST-PERLANE-NEXT: vpmovd2m %ymm2, %k1 +; AVX512-FAST-PERLANE-NEXT: vmovaps %ymm0, %ymm1 {%k1} +; AVX512-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) +; AVX512-FAST-PERLANE-NEXT: vzeroupper +; AVX512-FAST-PERLANE-NEXT: retq +; +; AVX512NOTDQ-FAST-LABEL: load_v32i1_broadcast_31_v8i1: +; AVX512NOTDQ-FAST: # %bb.0: +; AVX512NOTDQ-FAST-NEXT: movzbl 3(%rdi), %eax +; AVX512NOTDQ-FAST-NEXT: kmovd %eax, %k1 +; AVX512NOTDQ-FAST-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512NOTDQ-FAST-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} +; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] +; AVX512NOTDQ-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512NOTDQ-FAST-NEXT: vptestmd %ymm2, %ymm2, %k1 +; AVX512NOTDQ-FAST-NEXT: vmovaps %ymm0, %ymm1 {%k1} +; AVX512NOTDQ-FAST-NEXT: vmovaps %ymm1, (%rsi) +; AVX512NOTDQ-FAST-NEXT: vzeroupper +; AVX512NOTDQ-FAST-NEXT: retq +; +; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1: +; AVX512NOTDQ-FAST-PERLANE: # %bb.0: +; AVX512NOTDQ-FAST-PERLANE-NEXT: movzbl 3(%rdi), %eax +; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovd %eax, %k1 +; AVX512NOTDQ-FAST-PERLANE-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} +; AVX512NOTDQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] +; AVX512NOTDQ-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512NOTDQ-FAST-PERLANE-NEXT: vptestmd %ymm2, %ymm2, %k1 +; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovaps %ymm0, %ymm1 {%k1} +; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) +; AVX512NOTDQ-FAST-PERLANE-NEXT: vzeroupper +; AVX512NOTDQ-FAST-PERLANE-NEXT: retq %d0 = load <32 x i1>, <32 x i1>* %a0 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32> %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2 @@ -527,31 +555,57 @@ ret void } define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) { -; AVX512-LABEL: load_v64i1_broadcast_63_v8i1: -; AVX512: # %bb.0: -; AVX512-NEXT: kmovb 7(%rdi), 
%k0 -; AVX512-NEXT: vpmovm2d %k0, %ymm2 -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] -; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vpmovd2m %ymm2, %k1 -; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1} -; AVX512-NEXT: vmovaps %ymm1, (%rsi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v8i1: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: kmovb 7(%rdi), %k0 +; AVX512-FAST-NEXT: vpmovm2d %k0, %ymm2 +; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] +; AVX512-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512-FAST-NEXT: vpmovd2m %ymm2, %k1 +; AVX512-FAST-NEXT: vmovaps %ymm0, %ymm1 {%k1} +; AVX512-FAST-NEXT: vmovaps %ymm1, (%rsi) +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq ; -; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1: -; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: movzbl 7(%rdi), %eax -; AVX512NOTDQ-NEXT: kmovd %eax, %k1 -; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} -; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] -; AVX512NOTDQ-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1 -; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1} -; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi) -; AVX512NOTDQ-NEXT: vzeroupper -; AVX512NOTDQ-NEXT: retq +; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1: +; AVX512-FAST-PERLANE: # %bb.0: +; AVX512-FAST-PERLANE-NEXT: kmovb 7(%rdi), %k0 +; AVX512-FAST-PERLANE-NEXT: vpmovm2d %k0, %ymm2 +; AVX512-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] +; AVX512-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512-FAST-PERLANE-NEXT: vpmovd2m %ymm2, %k1 +; AVX512-FAST-PERLANE-NEXT: vmovaps %ymm0, %ymm1 {%k1} +; AVX512-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) +; AVX512-FAST-PERLANE-NEXT: vzeroupper +; AVX512-FAST-PERLANE-NEXT: retq +; +; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v8i1: +; AVX512NOTDQ-FAST: # %bb.0: +; AVX512NOTDQ-FAST-NEXT: movzbl 7(%rdi), %eax +; AVX512NOTDQ-FAST-NEXT: kmovd %eax, %k1 +; AVX512NOTDQ-FAST-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512NOTDQ-FAST-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} +; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] +; AVX512NOTDQ-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512NOTDQ-FAST-NEXT: vptestmd %ymm2, %ymm2, %k1 +; AVX512NOTDQ-FAST-NEXT: vmovaps %ymm0, %ymm1 {%k1} +; AVX512NOTDQ-FAST-NEXT: vmovaps %ymm1, (%rsi) +; AVX512NOTDQ-FAST-NEXT: vzeroupper +; AVX512NOTDQ-FAST-NEXT: retq +; +; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1: +; AVX512NOTDQ-FAST-PERLANE: # %bb.0: +; AVX512NOTDQ-FAST-PERLANE-NEXT: movzbl 7(%rdi), %eax +; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovd %eax, %k1 +; AVX512NOTDQ-FAST-PERLANE-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} +; AVX512NOTDQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] +; AVX512NOTDQ-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512NOTDQ-FAST-PERLANE-NEXT: vptestmd %ymm2, %ymm2, %k1 +; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovaps %ymm0, %ymm1 {%k1} +; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) +; AVX512NOTDQ-FAST-PERLANE-NEXT: vzeroupper +; AVX512NOTDQ-FAST-PERLANE-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32> %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2 @@ -559,29 +613,53 @@ ret void } define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* 
%a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) { -; AVX512-LABEL: load_v64i1_broadcast_63_v16i1: -; AVX512: # %bb.0: -; AVX512-NEXT: kmovw 6(%rdi), %k0 -; AVX512-NEXT: vpmovm2d %k0, %zmm2 -; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512-NEXT: vpermd %zmm2, %zmm3, %zmm2 -; AVX512-NEXT: vpmovd2m %zmm2, %k1 -; AVX512-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovaps %zmm1, (%rsi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v16i1: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: kmovw 6(%rdi), %k0 +; AVX512-FAST-NEXT: vpmovm2d %k0, %zmm2 +; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-FAST-NEXT: vpermd %zmm2, %zmm3, %zmm2 +; AVX512-FAST-NEXT: vpmovd2m %zmm2, %k1 +; AVX512-FAST-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; AVX512-FAST-NEXT: vmovaps %zmm1, (%rsi) +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq ; -; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1: -; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovw 6(%rdi), %k1 -; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512NOTDQ-NEXT: vpermd %zmm2, %zmm3, %zmm2 -; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi) -; AVX512NOTDQ-NEXT: vzeroupper -; AVX512NOTDQ-NEXT: retq +; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1: +; AVX512-FAST-PERLANE: # %bb.0: +; AVX512-FAST-PERLANE-NEXT: kmovw 6(%rdi), %k0 +; AVX512-FAST-PERLANE-NEXT: vpmovm2d %k0, %zmm2 +; AVX512-FAST-PERLANE-NEXT: vpshufd {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] +; AVX512-FAST-PERLANE-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[6,7,6,7,6,7,6,7] +; AVX512-FAST-PERLANE-NEXT: vpmovd2m %zmm2, %k1 +; AVX512-FAST-PERLANE-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; AVX512-FAST-PERLANE-NEXT: vmovaps %zmm1, (%rsi) +; AVX512-FAST-PERLANE-NEXT: vzeroupper +; AVX512-FAST-PERLANE-NEXT: retq +; +; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v16i1: +; AVX512NOTDQ-FAST: # %bb.0: +; AVX512NOTDQ-FAST-NEXT: kmovw 6(%rdi), %k1 +; AVX512NOTDQ-FAST-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512NOTDQ-FAST-NEXT: vpermd %zmm2, %zmm3, %zmm2 +; AVX512NOTDQ-FAST-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512NOTDQ-FAST-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; AVX512NOTDQ-FAST-NEXT: vmovaps %zmm1, (%rsi) +; AVX512NOTDQ-FAST-NEXT: vzeroupper +; AVX512NOTDQ-FAST-NEXT: retq +; +; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1: +; AVX512NOTDQ-FAST-PERLANE: # %bb.0: +; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovw 6(%rdi), %k1 +; AVX512NOTDQ-FAST-PERLANE-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512NOTDQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] +; AVX512NOTDQ-FAST-PERLANE-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[6,7,6,7,6,7,6,7] +; AVX512NOTDQ-FAST-PERLANE-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovaps %zmm1, (%rsi) +; AVX512NOTDQ-FAST-PERLANE-NEXT: vzeroupper +; AVX512NOTDQ-FAST-PERLANE-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32> %d2 = select <16 x i1> %d1, <16 x float> %a1, 
<16 x float> %a2 @@ -1170,30 +1248,55 @@ ret void } define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) { -; AVX512-LABEL: load_v32i1_broadcast_31_v8i1_store: -; AVX512: # %bb.0: -; AVX512-NEXT: kmovb 3(%rdi), %k0 -; AVX512-NEXT: vpmovm2d %k0, %ymm0 -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] -; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vpmovd2m %ymm0, %k0 -; AVX512-NEXT: kmovb %k0, (%rsi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512-FAST-LABEL: load_v32i1_broadcast_31_v8i1_store: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: kmovb 3(%rdi), %k0 +; AVX512-FAST-NEXT: vpmovm2d %k0, %ymm0 +; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] +; AVX512-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512-FAST-NEXT: vpmovd2m %ymm0, %k0 +; AVX512-FAST-NEXT: kmovb %k0, (%rsi) +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq ; -; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1_store: -; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: movzbl 3(%rdi), %eax -; AVX512NOTDQ-NEXT: kmovd %eax, %k1 -; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] -; AVX512NOTDQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0 -; AVX512NOTDQ-NEXT: kmovd %k0, %eax -; AVX512NOTDQ-NEXT: movb %al, (%rsi) -; AVX512NOTDQ-NEXT: vzeroupper -; AVX512NOTDQ-NEXT: retq +; AVX512-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1_store: +; AVX512-FAST-PERLANE: # %bb.0: +; AVX512-FAST-PERLANE-NEXT: kmovb 3(%rdi), %k0 +; AVX512-FAST-PERLANE-NEXT: vpmovm2d %k0, %ymm0 +; AVX512-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] +; AVX512-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512-FAST-PERLANE-NEXT: vpmovd2m %ymm0, %k0 +; AVX512-FAST-PERLANE-NEXT: kmovb %k0, (%rsi) +; AVX512-FAST-PERLANE-NEXT: vzeroupper +; AVX512-FAST-PERLANE-NEXT: retq +; +; AVX512NOTDQ-FAST-LABEL: load_v32i1_broadcast_31_v8i1_store: +; AVX512NOTDQ-FAST: # %bb.0: +; AVX512NOTDQ-FAST-NEXT: movzbl 3(%rdi), %eax +; AVX512NOTDQ-FAST-NEXT: kmovd %eax, %k1 +; AVX512NOTDQ-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512NOTDQ-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] +; AVX512NOTDQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512NOTDQ-FAST-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512NOTDQ-FAST-NEXT: kmovd %k0, %eax +; AVX512NOTDQ-FAST-NEXT: movb %al, (%rsi) +; AVX512NOTDQ-FAST-NEXT: vzeroupper +; AVX512NOTDQ-FAST-NEXT: retq +; +; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1_store: +; AVX512NOTDQ-FAST-PERLANE: # %bb.0: +; AVX512NOTDQ-FAST-PERLANE-NEXT: movzbl 3(%rdi), %eax +; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovd %eax, %k1 +; AVX512NOTDQ-FAST-PERLANE-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512NOTDQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] +; AVX512NOTDQ-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512NOTDQ-FAST-PERLANE-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovd %k0, %eax +; AVX512NOTDQ-FAST-PERLANE-NEXT: movb %al, (%rsi) +; AVX512NOTDQ-FAST-PERLANE-NEXT: vzeroupper +; AVX512NOTDQ-FAST-PERLANE-NEXT: retq %d0 = load <32 x i1>, <32 x i1>* %a0 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32> store <8 x i1> %d1, <8 x i1>* %a1 @@ -1408,57 +1511,104 @@ ret void } define void 
@load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) { -; AVX512-LABEL: load_v64i1_broadcast_63_v8i1_store: -; AVX512: # %bb.0: -; AVX512-NEXT: kmovb 7(%rdi), %k0 -; AVX512-NEXT: vpmovm2d %k0, %ymm0 -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] -; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vpmovd2m %ymm0, %k0 -; AVX512-NEXT: kmovb %k0, (%rsi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v8i1_store: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: kmovb 7(%rdi), %k0 +; AVX512-FAST-NEXT: vpmovm2d %k0, %ymm0 +; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] +; AVX512-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512-FAST-NEXT: vpmovd2m %ymm0, %k0 +; AVX512-FAST-NEXT: kmovb %k0, (%rsi) +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq ; -; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1_store: -; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: movzbl 7(%rdi), %eax -; AVX512NOTDQ-NEXT: kmovd %eax, %k1 -; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] -; AVX512NOTDQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0 -; AVX512NOTDQ-NEXT: kmovd %k0, %eax -; AVX512NOTDQ-NEXT: movb %al, (%rsi) -; AVX512NOTDQ-NEXT: vzeroupper -; AVX512NOTDQ-NEXT: retq +; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1_store: +; AVX512-FAST-PERLANE: # %bb.0: +; AVX512-FAST-PERLANE-NEXT: kmovb 7(%rdi), %k0 +; AVX512-FAST-PERLANE-NEXT: vpmovm2d %k0, %ymm0 +; AVX512-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] +; AVX512-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512-FAST-PERLANE-NEXT: vpmovd2m %ymm0, %k0 +; AVX512-FAST-PERLANE-NEXT: kmovb %k0, (%rsi) +; AVX512-FAST-PERLANE-NEXT: vzeroupper +; AVX512-FAST-PERLANE-NEXT: retq +; +; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v8i1_store: +; AVX512NOTDQ-FAST: # %bb.0: +; AVX512NOTDQ-FAST-NEXT: movzbl 7(%rdi), %eax +; AVX512NOTDQ-FAST-NEXT: kmovd %eax, %k1 +; AVX512NOTDQ-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512NOTDQ-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] +; AVX512NOTDQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512NOTDQ-FAST-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512NOTDQ-FAST-NEXT: kmovd %k0, %eax +; AVX512NOTDQ-FAST-NEXT: movb %al, (%rsi) +; AVX512NOTDQ-FAST-NEXT: vzeroupper +; AVX512NOTDQ-FAST-NEXT: retq +; +; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1_store: +; AVX512NOTDQ-FAST-PERLANE: # %bb.0: +; AVX512NOTDQ-FAST-PERLANE-NEXT: movzbl 7(%rdi), %eax +; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovd %eax, %k1 +; AVX512NOTDQ-FAST-PERLANE-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512NOTDQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] +; AVX512NOTDQ-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512NOTDQ-FAST-PERLANE-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovd %k0, %eax +; AVX512NOTDQ-FAST-PERLANE-NEXT: movb %al, (%rsi) +; AVX512NOTDQ-FAST-PERLANE-NEXT: vzeroupper +; AVX512NOTDQ-FAST-PERLANE-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32> store <8 x i1> %d1, <8 x i1>* %a1 ret void } define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) { -; AVX512-LABEL: 
load_v64i1_broadcast_63_v16i1_store: -; AVX512: # %bb.0: -; AVX512-NEXT: kmovw 6(%rdi), %k0 -; AVX512-NEXT: vpmovm2d %k0, %zmm0 -; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vpmovd2m %zmm0, %k0 -; AVX512-NEXT: kmovw %k0, (%rsi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v16i1_store: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: kmovw 6(%rdi), %k0 +; AVX512-FAST-NEXT: vpmovm2d %k0, %zmm0 +; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512-FAST-NEXT: vpmovd2m %zmm0, %k0 +; AVX512-FAST-NEXT: kmovw %k0, (%rsi) +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq ; -; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1_store: -; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovw 6(%rdi), %k1 -; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512NOTDQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi) -; AVX512NOTDQ-NEXT: vzeroupper -; AVX512NOTDQ-NEXT: retq +; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1_store: +; AVX512-FAST-PERLANE: # %bb.0: +; AVX512-FAST-PERLANE-NEXT: kmovw 6(%rdi), %k0 +; AVX512-FAST-PERLANE-NEXT: vpmovm2d %k0, %zmm0 +; AVX512-FAST-PERLANE-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] +; AVX512-FAST-PERLANE-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7] +; AVX512-FAST-PERLANE-NEXT: vpmovd2m %zmm0, %k0 +; AVX512-FAST-PERLANE-NEXT: kmovw %k0, (%rsi) +; AVX512-FAST-PERLANE-NEXT: vzeroupper +; AVX512-FAST-PERLANE-NEXT: retq +; +; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v16i1_store: +; AVX512NOTDQ-FAST: # %bb.0: +; AVX512NOTDQ-FAST-NEXT: kmovw 6(%rdi), %k1 +; AVX512NOTDQ-FAST-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512NOTDQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512NOTDQ-FAST-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512NOTDQ-FAST-NEXT: kmovw %k0, (%rsi) +; AVX512NOTDQ-FAST-NEXT: vzeroupper +; AVX512NOTDQ-FAST-NEXT: retq +; +; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1_store: +; AVX512NOTDQ-FAST-PERLANE: # %bb.0: +; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovw 6(%rdi), %k1 +; AVX512NOTDQ-FAST-PERLANE-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512NOTDQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] +; AVX512NOTDQ-FAST-PERLANE-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7] +; AVX512NOTDQ-FAST-PERLANE-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovw %k0, (%rsi) +; AVX512NOTDQ-FAST-PERLANE-NEXT: vzeroupper +; AVX512NOTDQ-FAST-PERLANE-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32> store <16 x i1> %d1, <16 x i1>* %a1 diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc 
-mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512dq,+fast-variable-shuffle %s -o - | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle %s -o - | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512dq,+fast-variable-perlane-shuffle %s -o - | FileCheck %s define <4 x i32> @test_2xi32_to_4xi32(<4 x i32> %vec) { ; CHECK-LABEL: test_2xi32_to_4xi32: diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle %s -o - | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle %s -o - | FileCheck --check-prefixes=CHECK,CHECK-FAST %s +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-perlane-shuffle %s -o - | FileCheck --check-prefixes=CHECK,CHECK-FAST-PERLANE %s ; FIXME: All cases here should be fixed by PR34380 @@ -1758,13 +1759,24 @@ } define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) { -; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [12,9,4,10] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_16xi32_to_4xi32_perm_mask9: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [12,9,4,10] +; CHECK-FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-FAST-NEXT: vzeroupper +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_16xi32_to_4xi32_perm_mask9: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <4,1,u,2> +; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; CHECK-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm1, %ymm1 +; CHECK-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; CHECK-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,3] +; CHECK-FAST-PERLANE-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 +; CHECK-FAST-PERLANE-NEXT: vzeroupper +; CHECK-FAST-PERLANE-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> ret <4 x i32> %res } @@ -1937,13 +1949,22 @@ ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask1: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [6,4,6,1] -; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask1: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,4,6,1] +; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0 +; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 +; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask1: +; 
CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,1] +; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 @@ -1951,26 +1972,43 @@ } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1] -; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} -; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1] +; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,1] +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask2: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [6,3,6,3] -; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask2: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,3,6,3] +; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0 +; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 +; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask2: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3] +; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 @@ -1978,36 +2016,60 @@ } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,3,6,3] -; CHECK-NEXT: vptestnmq %ymm1, %ymm1, 
%k1 -; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} -; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,3,6,3] +; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3] +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) { -; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask3: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [6,0,0,7] -; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask3: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [6,0,0,7] +; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask3: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; CHECK-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,3] +; CHECK-FAST-PERLANE-NEXT: retq %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask3: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [6,0,0,7] -; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask3: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,0,0,7] +; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0 +; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 +; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask3: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,0,3] +; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 @@ -2015,26 +2077,43 @@ } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) { -; CHECK-LABEL: 
test_masked_z_8xi64_to_4xi64_perm_mask3: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7] -; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} -; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7] +; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,0,3] +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,7,7,5] -; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask4: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,7,7,5] +; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0 +; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 +; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask4: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,3,1] +; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 @@ -2042,13 +2121,21 @@ } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,7,5] -; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} -; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,7,5] +; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} ymm0 = 
ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,3,1] +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer @@ -2082,23 +2169,39 @@ ret <4 x i64> %res } define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) { -; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask6: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,3] -; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask6: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,3] +; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask6: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,1,3] +; CHECK-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; CHECK-FAST-PERLANE-NEXT: retq %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask6: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,6,5,3] -; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask6: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [7,6,5,3] +; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0 +; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 +; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask6: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,2,1,3] +; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1 +; CHECK-FAST-PERLANE-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 @@ -2106,13 +2209,22 @@ } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,3] -; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} -; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,3] +; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: 
test_masked_z_8xi64_to_4xi64_perm_mask6: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,2,1,3] +; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer @@ -2250,14 +2362,22 @@ } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,4] -; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,4] +; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 +; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 +; CHECK-FAST-PERLANE-NEXT: vpblendd $3, (%rdi), %ymm2, %ymm2 # ymm2 = mem[0,1],ymm2[2,3,4,5,6,7] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,3,2,0] +; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2266,14 +2386,22 @@ } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,4] -; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,4] +; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 +; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} +; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-FAST-PERLANE-NEXT: vpblendd $3, (%rdi), %ymm1, %ymm1 # ymm1 = mem[0,1],ymm1[2,3,4,5,6,7] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,3,2,0] +; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2282,14 +2410,22 @@ } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: 
test_masked_8xi64_to_4xi64_perm_mem_mask2: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,5,5,1] -; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,5,5,1] +; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 +; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 +; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,0] +; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2298,14 +2434,22 @@ } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,5,5,1] -; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [3,5,5,1] +; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 +; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} +; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,0] +; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2314,25 +2458,40 @@ } define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp) { -; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask3: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,0,2] -; CHECK-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mem_mask3: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,0,2] +; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mem_mask3: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm0, %ymm0 # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; CHECK-FAST-PERLANE-NEXT: vpermq 
{{.*#+}} ymm0 = ymm0[2,1,1,3] +; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,0,0,2] -; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [7,0,0,2] +; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 +; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 +; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm2, %ymm2 # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,3] +; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2341,14 +2500,22 @@ } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,0,2] -; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,0,2] +; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 +; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} +; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 +; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm1, %ymm1 # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,3] +; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2389,14 +2556,22 @@ } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,7,1] -; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} -; CHECK-NEXT: retq +; 
CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,7,1] +; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 +; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 +; CHECK-FAST-PERLANE-NEXT: vpblendd $192, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,2,3,1] +; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2405,14 +2580,22 @@ } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,7,1] -; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,7,1] +; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 +; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} +; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 +; CHECK-FAST-PERLANE-NEXT: vpblendd $192, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,2,3,1] +; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2464,14 +2647,22 @@ } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,1,5] -; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,1,5] +; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 +; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 +; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,2,0,1] +; CHECK-FAST-PERLANE-NEXT: retq %vec = load 
<8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2480,14 +2671,22 @@ } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,1,5] -; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,1,5] +; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 +; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} +; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,2,0,1] +; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -3050,15 +3249,27 @@ ret <4 x float> %res } define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { -; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [8,6,10,6] -; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 -; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm3 = [8,6,10,6] +; CHECK-FAST-NEXT: vpermps %zmm0, %zmm3, %zmm0 +; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-FAST-NEXT: vcmpeqps %xmm3, %xmm2, %k1 +; CHECK-FAST-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-FAST-NEXT: vzeroupper +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = [0,6,2,6] +; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm0, %xmm3, %xmm4 +; CHECK-FAST-PERLANE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm0, %xmm2, %k1 +; CHECK-FAST-PERLANE-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1} +; CHECK-FAST-PERLANE-NEXT: vzeroupper +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 @@ -3066,15 +3277,26 @@ } define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) { -; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [8,6,10,6] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps 
%xmm3, %xmm1, %k1 -; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [8,6,10,6] +; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-FAST-NEXT: vcmpeqps %xmm3, %xmm1, %k1 +; CHECK-FAST-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-FAST-NEXT: vzeroupper +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm3 +; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,2,6] +; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm4, %xmm1, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z} +; CHECK-FAST-PERLANE-NEXT: vzeroupper +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer @@ -3234,15 +3456,26 @@ } define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { -; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1] -; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1} -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2 +; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1] +; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 +; CHECK-FAST-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-FAST-NEXT: vcmpeqps %ymm2, %ymm1, %k1 +; CHECK-FAST-NEXT: vmovaps %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 +; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 +; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm4 = [9,5,2,3,2,8,8,1] +; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4 +; CHECK-FAST-PERLANE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm2, %ymm1, %k1 +; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm0 {%k1} +; CHECK-FAST-PERLANE-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -3251,15 +3484,26 @@ } define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %mask) { -; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 -; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovaps %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: 
test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2 +; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] +; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-FAST-NEXT: vcmpeqps %ymm3, %ymm0, %k1 +; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} +; CHECK-FAST-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 +; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 +; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] +; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm4, %ymm0, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z} +; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-FAST-PERLANE-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -3757,15 +4001,25 @@ ret <4 x double> %res } define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { -; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4: -; CHECK: # %bb.0: -; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3 -; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,1,5,5] -; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4 -; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1 -; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm4 = [1,1,5,5] +; CHECK-FAST-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4 +; CHECK-FAST-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; CHECK-FAST-NEXT: vcmpeqpd %ymm0, %ymm2, %k1 +; CHECK-FAST-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1} +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1] +; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 @@ -3773,15 +4027,24 @@ } define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4: -; CHECK: # %bb.0: -; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3 -; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [1,1,5,5] -; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1 -; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z} -; CHECK-NEXT: vmovapd %ymm2, %ymm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [1,1,5,5] +; CHECK-FAST-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; CHECK-FAST-NEXT: vcmpeqpd 
%ymm4, %ymm1, %k1 +; CHECK-FAST-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z} +; CHECK-FAST-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 +; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1] +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer @@ -3817,24 +4080,41 @@ ret <4 x double> %res } define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) { -; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask6: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [5,8,7,8] -; CHECK-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mask6: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [5,8,7,8] +; CHECK-FAST-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_4xdouble_perm_mask6: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; CHECK-FAST-PERLANE-NEXT: retq %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> ret <4 x double> %res } define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { -; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [5,8,7,8] -; CHECK-NEXT: vpermi2pd %zmm0, %zmm0, %zmm3 -; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1 -; CHECK-NEXT: vblendmpd %ymm3, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [5,8,7,8] +; CHECK-FAST-NEXT: vpermi2pd %zmm0, %zmm0, %zmm3 +; CHECK-FAST-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; CHECK-FAST-NEXT: vcmpeqpd %ymm0, %ymm2, %k1 +; CHECK-FAST-NEXT: vblendmpd %ymm3, %ymm1, %ymm0 {%k1} +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm2, %k1 +; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm3[1],ymm0[1],ymm3[3],ymm0[3] +; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 @@ -3842,28 +4122,47 @@ } define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) { -; CHECK-LABEL: 
test_masked_z_8xdouble_to_4xdouble_perm_mask6: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [5,8,7,8] -; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 -; CHECK-NEXT: vpermt2pd %zmm0, %zmm2, %zmm0 {%k1} {z} -; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [5,8,7,8] +; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 +; CHECK-FAST-NEXT: vpermt2pd %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 +; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { -; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [3,5,0,6] -; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,5,0,6] +; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm0 +; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-FAST-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,3] +; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm2, %k1 +; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1],ymm3[1],ymm0[2],ymm3[2] +; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 @@ -3871,14 +4170,23 @@ } define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [3,5,0,6] -; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 -; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} -; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = 
[3,5,0,6] +; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 +; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,3] +; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 +; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm2[1],ymm0[2],ymm2[2] +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer @@ -4004,15 +4312,23 @@ } define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) { -; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd (%rdi), %ymm2 -; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [3,4,2,6] -; CHECK-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3 -; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1} -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2 +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,4,2,6] +; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3 +; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 +; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vpermpd $236, (%rdi), %ymm2 # ymm2 = mem[0,3,2,3] +; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 +; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 32(%rdi){1to4}, %ymm2, %ymm0 {%k1} +; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -4021,15 +4337,23 @@ } define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd (%rdi), %ymm2 -; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [3,4,2,6] -; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 -; CHECK-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovapd %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2 +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [3,4,2,6] +; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 +; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z} +; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vpermpd $236, (%rdi), %ymm1 # ymm1 
= mem[0,3,2,3] +; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 +; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 32(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z} +; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -4038,15 +4362,24 @@ } define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) { -; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd (%rdi), %ymm2 -; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,2,3,4] -; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1} -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2 +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [1,2,3,4] +; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3 +; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 +; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %ymm2 +; CHECK-FAST-PERLANE-NEXT: vperm2f128 $33, 32(%rdi), %ymm2, %ymm3 # ymm3 = ymm2[2,3],mem[0,1] +; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm1, %k1 +; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm2[1],ymm3[0],ymm2[3],ymm3[2] +; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -4055,15 +4388,24 @@ } define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd (%rdi), %ymm2 -; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,2,3,4] -; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 -; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovapd %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2 +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [1,2,3,4] +; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 +; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z} +; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %ymm1 +; CHECK-FAST-PERLANE-NEXT: vperm2f128 $33, 32(%rdi), %ymm1, %ymm2 # ymm2 = ymm1[2,3],mem[0,1] +; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 +; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm1[1],ymm2[0],ymm1[3],ymm2[2] +; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> 
%mask, zeroinitializer diff --git a/llvm/test/CodeGen/X86/avx512-trunc.ll b/llvm/test/CodeGen/X86/avx512-trunc.ll --- a/llvm/test/CodeGen/X86/avx512-trunc.ll +++ b/llvm/test/CodeGen/X86/avx512-trunc.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=KNL -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,SKX +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,SKX +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,SKX attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll @@ -3,8 +3,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512 ; ; 128-bit vectors diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -3,9 +3,11 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F -; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512VLBW ; ; 128-bit vectors diff --git a/llvm/test/CodeGen/X86/broadcastm-lowering.ll b/llvm/test/CodeGen/X86/broadcastm-lowering.ll --- a/llvm/test/CodeGen/X86/broadcastm-lowering.ll +++ b/llvm/test/CodeGen/X86/broadcastm-lowering.ll @@ -1,10 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX512CD -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512cd,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX512CD -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX512CDBW -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512cd,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX512CDBW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX512VLCDBW -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX512VLCDBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX512CD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX512CD +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512cd,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX512CD +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512cd,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX512CD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX512CDBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX512CDBW +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512cd,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX512CDBW +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512cd,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX512CDBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX512VLCDBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX512VLCDBW +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX512VLCDBW +; RUN: llc < %s 
-mtriple=i686-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX512VLCDBW define <2 x i64> @test_mm_epi64(<8 x i16> %a, <8 x i16> %b) { ; AVX512CD-LABEL: test_mm_epi64: diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll --- a/llvm/test/CodeGen/X86/combine-shl.ll +++ b/llvm/test/CodeGen/X86/combine-shl.ll @@ -2,7 +2,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST-PERLANE ; fold (shl 0, x) -> 0 define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) { @@ -145,14 +146,23 @@ ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: combine_vec_shl_trunc_and: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> -; AVX-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-FAST-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX-FAST-ALL-LABEL: combine_vec_shl_trunc_and: +; AVX-FAST-ALL: # %bb.0: +; AVX-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX-FAST-ALL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 +; AVX-FAST-ALL-NEXT: vzeroupper +; AVX-FAST-ALL-NEXT: retq +; +; AVX-FAST-PERLANE-LABEL: combine_vec_shl_trunc_and: +; AVX-FAST-PERLANE: # %bb.0: +; AVX-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX-FAST-PERLANE-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 +; AVX-FAST-PERLANE-NEXT: vzeroupper +; AVX-FAST-PERLANE-NEXT: retq %1 = and <4 x i64> %y, %2 = trunc <4 x i64> %1 to <4 x i32> %3 = shl <4 x i32> %x, %2 diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll --- a/llvm/test/CodeGen/X86/combine-sra.ll +++ b/llvm/test/CodeGen/X86/combine-sra.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST-PERLANE 
; fold (sra 0, x) -> 0 define <4 x i32> @combine_vec_ashr_zero(<4 x i32> %x) { @@ -175,14 +176,23 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: combine_vec_ashr_trunc_and: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpsravd %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_and: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-FAST-ALL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_and: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq %1 = and <4 x i64> %y, %2 = trunc <4 x i64> %1 to <4 x i32> %3 = ashr <4 x i32> %x, %2 @@ -213,13 +223,21 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: combine_vec_ashr_trunc_lshr: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_lshr: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_lshr: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX2-FAST-PERLANE-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq %1 = lshr <4 x i64> %x, %2 = trunc <4 x i64> %1 to <4 x i32> %3 = ashr <4 x i32> %2, @@ -278,13 +296,21 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: combine_vec_ashr_trunc_ashr: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_ashr: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_ashr: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX2-FAST-PERLANE-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; 
AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq %1 = ashr <4 x i64> %x, %2 = trunc <4 x i64> %1 to <4 x i32> %3 = ashr <4 x i32> %2, diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll --- a/llvm/test/CodeGen/X86/combine-srl.ll +++ b/llvm/test/CodeGen/X86/combine-srl.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST-PERLANE ; fold (srl 0, x) -> 0 define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) { @@ -210,14 +211,23 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: combine_vec_lshr_trunc_lshr1: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_lshr1: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: combine_vec_lshr_trunc_lshr1: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq %1 = lshr <4 x i64> %x, %2 = trunc <4 x i64> %1 to <4 x i32> %3 = lshr <4 x i32> %2, @@ -423,14 +433,23 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: combine_vec_lshr_trunc_and: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_and: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: combine_vec_lshr_trunc_and: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2 +; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq %1 = and <4 x i64> %y, %2 = trunc <4 x i64> %1 to <4 x i32> %3 = lshr <4 x i32> %x, %2 diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll --- a/llvm/test/CodeGen/X86/insertelement-zero.ll +++ b/llvm/test/CodeGen/X86/insertelement-zero.ll @@ -5,7 +5,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST define <2 x double> @insert_v2f64_z1(<2 x double> %a) { ; SSE2-LABEL: insert_v2f64_z1: diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -3,7 +3,8 @@ ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+xop | FileCheck %s --check-prefix=XOP define void @v3i64(<2 x i64> %a, <2 x i64> %b, <3 x i64>* %p) nounwind { @@ -582,20 +583,35 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: v12i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX2-FAST-NEXT: vmovaps %xmm0, 32(%rdi) -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdi) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: v12i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6> +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm2 +; AVX2-FAST-ALL-NEXT: vbroadcastsd %xmm1, %ymm3 +; 
AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm3 = +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm3, %ymm0 +; AVX2-FAST-ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX2-FAST-ALL-NEXT: vmovaps %xmm0, 32(%rdi) +; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rdi) +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: v12i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, 32(%rdi) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdi) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; XOP-LABEL: v12i32: ; XOP: # %bb.0: @@ -1155,27 +1171,50 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: interleave_24i16_in: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqu (%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqu (%rdx), %xmm1 -; AVX2-FAST-NEXT: vmovdqu (%rcx), %xmm2 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,22,23,u,u,24,25,28,29,u,u,26,27] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqu %xmm0, 32(%rdi) -; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rdi) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: interleave_24i16_in: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqu (%rsi), %xmm0 +; AVX2-FAST-ALL-NEXT: vmovdqu (%rdx), %xmm1 +; AVX2-FAST-ALL-NEXT: vmovdqu (%rcx), %xmm2 +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm4, %ymm4 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6] +; AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,22,23,u,u,24,25,28,29,u,u,26,27] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-ALL-NEXT: vmovdqu %xmm0, 32(%rdi) +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm3, (%rdi) +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: interleave_24i16_in: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rdx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rcx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %xmm0, 32(%rdi) +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rdi) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; XOP-LABEL: interleave_24i16_in: ; XOP: # %bb.0: @@ -1349,35 +1388,65 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: interleave_24i32_out: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovups (%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups 64(%rdi), %ymm2 -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> -; 
AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, (%rsi) -; AVX2-FAST-NEXT: vmovups %ymm4, (%rdx) -; AVX2-FAST-NEXT: vmovups %ymm0, (%rcx) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: interleave_24i32_out: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovups (%rdi), %ymm0 +; AVX2-FAST-ALL-NEXT: vmovups 32(%rdi), %ymm1 +; AVX2-FAST-ALL-NEXT: vmovups 64(%rdi), %ymm2 +; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm3, %ymm3 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-ALL-NEXT: vpermps %ymm4, %ymm5, %ymm4 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] +; AVX2-FAST-ALL-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm4 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermps %ymm5, %ymm6, %ymm5 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = +; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-ALL-NEXT: vmovups %ymm3, (%rsi) +; AVX2-FAST-ALL-NEXT: vmovups %ymm4, (%rdx) +; AVX2-FAST-ALL-NEXT: vmovups %ymm0, (%rcx) +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: interleave_24i32_out: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovups (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups 32(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups 64(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; XOP-LABEL: interleave_24i32_out: ; XOP: # %bb.0: @@ -1572,34 +1641,62 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: interleave_24i32_in: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovups (%rsi), %ymm0 -; AVX2-FAST-NEXT: vmovups (%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovups (%rcx), %ymm2 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] -; AVX2-FAST-NEXT: vbroadcastsd (%rcx), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <5,u,u,6,u,u,7,u> -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vbroadcastsd 24(%rsi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-FAST-NEXT: vmovups %ymm4, 64(%rdi) -; AVX2-FAST-NEXT: vmovups %ymm3, (%rdi) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: interleave_24i32_in: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovups (%rsi), %ymm0 +; AVX2-FAST-ALL-NEXT: vmovups (%rdx), %ymm1 +; AVX2-FAST-ALL-NEXT: vmovups (%rcx), %ymm2 +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-ALL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm3, %ymm3 +; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-FAST-ALL-NEXT: vbroadcastsd (%rcx), %ymm4 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm4 = <5,u,u,6,u,u,7,u> +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm4 +; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3] +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-FAST-ALL-NEXT: vbroadcastsd 24(%rsi), %ymm5 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX2-FAST-ALL-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-ALL-NEXT: vmovups %ymm0, 32(%rdi) +; 
AVX2-FAST-ALL-NEXT: vmovups %ymm4, 64(%rdi) +; AVX2-FAST-ALL-NEXT: vmovups %ymm3, (%rdi) +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: interleave_24i32_in: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups (%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups (%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = mem[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rcx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rsi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, 32(%rdi) +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, 64(%rdi) +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, (%rdi) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; XOP-LABEL: interleave_24i32_in: ; XOP: # %bb.0: diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -3,9 +3,11 @@ ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 -; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512 +; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512 +; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+xop | FileCheck %s --check-prefixes=AVX,XOP define void @insert_v7i8_v2i16_2(<7 x i8> *%a0, <2 x i16> *%a1) nounwind { diff --git 
a/llvm/test/CodeGen/X86/paddus.ll b/llvm/test/CodeGen/X86/paddus.ll --- a/llvm/test/CodeGen/X86/paddus.ll +++ b/llvm/test/CodeGen/X86/paddus.ll @@ -4,7 +4,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 ; FIXME: should be paddusb define <16 x i8> @test1(<16 x i8> %x) { diff --git a/llvm/test/CodeGen/X86/phaddsub.ll b/llvm/test/CodeGen/X86/phaddsub.ll --- a/llvm/test/CodeGen/X86/phaddsub.ll +++ b/llvm/test/CodeGen/X86/phaddsub.ll @@ -5,7 +5,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2-SLOW ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2-FAST -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-SHUF +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-SHUF +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-SHUF define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) { ; SSSE3-LABEL: phaddw1: diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -4,8 +4,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 define <8 x i16> @test1(<8 x i16> %x) nounwind { ; SSE-LABEL: test1: @@ 
-2591,24 +2593,42 @@ ; AVX2-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: test33: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-FAST-NEXT: vpxor %ymm3, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] -; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-FAST-NEXT: vblendvpd %ymm4, %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpxor %ymm3, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: test33: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm1, %ymm4 +; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-ALL-NEXT: vblendvpd %ymm4, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm2, %ymm3 +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm2, %ymm6, %ymm2 +; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm2 +; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-FAST-ALL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: test33: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6] +; AVX2-FAST-PERLANE-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: test33: ; AVX512: # %bb.0: @@ -2811,26 +2831,46 @@ ; AVX2-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: test34: 
-; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] -; AVX2-FAST-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-FAST-NEXT: vpxor %ymm3, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] -; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-FAST-NEXT: vblendvpd %ymm4, %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpxor %ymm3, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: test34: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] +; AVX2-FAST-ALL-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm1, %ymm4 +; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-ALL-NEXT: vblendvpd %ymm4, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm2, %ymm3 +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm2, %ymm6, %ymm2 +; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm2 +; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-FAST-ALL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: test34: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6] +; AVX2-FAST-PERLANE-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; 
AVX2-FAST-PERLANE-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: test34: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -4,8 +4,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW declare <1 x i8> @llvm.sadd.sat.v1i8(<1 x i8>, <1 x i8>) declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>) diff --git a/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll b/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll --- a/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll +++ b/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE ; PR32449 define <2 x double> @foo2(<2 x double> %v, <2 x double> *%p) nounwind { @@ -35,12 +36,19 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdi) ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: foo8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vbroadcastss {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rdi) -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: foo8: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vbroadcastss {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: foo8: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-FAST-PERLANE-NEXT: retq %res = shufflevector <8 x float> %v, <8 x 
float> undef, <8 x i32> %res1 = shufflevector<8 x float> %res, <8 x float> undef, <8 x i32> store <8 x float> %res, <8 x float>* %p diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll @@ -3,11 +3,15 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL define void @shuffle_v16i8_to_v8i8_1(<16 x i8>* %L, <8 x i8>* %S) nounwind { ; SSE2-LABEL: shuffle_v16i8_to_v8i8_1: diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll @@ -1,11 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind { ; AVX-LABEL: shuffle_v32i8_to_v16i8_1: diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -1,8 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL,AVX512BWVL-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL,AVX512BWVL-FAST-PERLANE define void @shuffle_v64i8_to_v32i8_1(<64 x i8>* %L, <32 x i8>* %S) nounwind { ; AVX512F-LABEL: shuffle_v64i8_to_v32i8_1: @@ -17,41 +20,29 @@ ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v64i8_to_v32i8_1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] -; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512BWVL-FAST-ALL-LABEL: shuffle_v64i8_to_v32i8_1: +; AVX512BWVL-FAST-ALL: # %bb.0: +; AVX512BWVL-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BWVL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] +; AVX512BWVL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] +; AVX512BWVL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] +; AVX512BWVL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512BWVL-FAST-ALL-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512BWVL-FAST-ALL-NEXT: vzeroupper +; AVX512BWVL-FAST-ALL-NEXT: retq ; -; AVX512BW-LABEL: shuffle_v64i8_to_v32i8_1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,9,11] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa %ymm1, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8_1: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] -; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] -; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512BWVL-FAST-PERLANE-LABEL: shuffle_v64i8_to_v32i8_1: +; AVX512BWVL-FAST-PERLANE: # %bb.0: +; AVX512BWVL-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BWVL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] +; AVX512BWVL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] +; AVX512BWVL-FAST-PERLANE-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512BWVL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BWVL-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BWVL-FAST-PERLANE-NEXT: vzeroupper +; AVX512BWVL-FAST-PERLANE-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> store <32 x i8> %strided.vec, <32 x i8>* %S @@ -71,18 +62,6 @@ ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v32i16_to_v16i16_1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] -; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; ; AVX512BW-LABEL: shuffle_v32i16_to_v16i16_1: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,33,35,37,39,9,11,13,15,41,43,45,47] @@ -118,33 +97,23 @@ ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i32_to_v8i32_1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] -; AVX512VL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1 -; AVX512VL-NEXT: vmovdqa %ymm1, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512BWVL-FAST-ALL-LABEL: shuffle_v16i32_to_v8i32_1: +; AVX512BWVL-FAST-ALL: # %bb.0: +; AVX512BWVL-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] +; AVX512BWVL-FAST-ALL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-FAST-ALL-NEXT: vmovdqa %ymm1, (%rsi) +; AVX512BWVL-FAST-ALL-NEXT: vzeroupper +; AVX512BWVL-FAST-ALL-NEXT: retq ; -; AVX512BW-LABEL: shuffle_v16i32_to_v8i32_1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,17,19,21,23] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa %ymm1, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32_1: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] -; AVX512BWVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512BWVL-FAST-PERLANE-LABEL: shuffle_v16i32_to_v8i32_1: +; AVX512BWVL-FAST-PERLANE: # %bb.0: +; AVX512BWVL-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 +; AVX512BWVL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],mem[1,3],ymm0[5,7],mem[5,7] +; AVX512BWVL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BWVL-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rsi) +; AVX512BWVL-FAST-PERLANE-NEXT: vzeroupper +; AVX512BWVL-FAST-PERLANE-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %L %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> store <8 x i32> %strided.vec, <8 x i32>* %S diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -3,12 +3,17 @@ ; RUN: 
llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL ; PR31551 ; Pairs of shufflevector:trunc functions with functional equivalence. 
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -1,12 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512BWVL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512VBMIVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512VBMIVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512VBMIVL ; PR31551 ; Pairs of shufflevector:trunc functions with functional equivalence. 
@@ -309,13 +314,20 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi) ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps (%rdi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsi) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermps (%rdi), %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vmovaps %xmm0, (%rsi) +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_v4i64_to_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, (%rsi) +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i64_to_v4i32: ; AVX512F: # %bb.0: @@ -790,13 +802,21 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: ; AVX512F: # %bb.0: @@ -853,13 +873,21 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq 
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512F: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -1,10 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FAST-ALL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL
; PR31551
; Pairs of shufflevector:trunc functions with functional equivalence.
@@ -23,17 +28,29 @@ ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v64i8_to_v32i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] -; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v64i8_to_v32i8: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] +; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512VL-FAST-ALL-NEXT: vzeroupper +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v64i8_to_v32i8: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512VL-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-FAST-PERLANE-NEXT: vzeroupper +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_to_v32i8: ; AVX512BW: # %bb.0: diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -4,8 +4,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
declare <1 x i8> @llvm.ssub.sat.v1i8(<1 x i8>, <1 x i8>)
declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>)
diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
--- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
@@ -4,8 +4,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
declare <1 x i8> @llvm.uadd.sat.v1i8(<1 x i8>, <1 x i8>)
declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>)
diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll
--- a/llvm/test/CodeGen/X86/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll
@@ -4,8 +4,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
declare <1 x i8> @llvm.usub.sat.v1i8(<1 x i8>, <1 x i8>)
declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>)
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -4,7 +4,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
declare {<1 x i32>, <1 x i1>} @llvm.sadd.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>)
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -4,8 +4,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
declare {<1 x i32>, <1 x i1>} @llvm.smul.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.smul.with.overflow.v2i32(<2 x i32>, <2 x i32>)
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -4,7 +4,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
declare {<1 x i32>, <1 x i1>} @llvm.ssub.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>)
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -4,7 +4,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
declare {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>)
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -4,8 +4,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
declare {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32>, <2 x i32>)
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -4,7 +4,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
declare {<1 x i32>, <1 x i1>} @llvm.usub.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32>, <2 x i32>)
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -1,9 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX1
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX2
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c,+fast-variable-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX2
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX2
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX2
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX512
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX512
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX512
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX512
;
; Half to Float
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2,AVX2-SLOW %s
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-shuffle | FileCheck --check-prefixes=AVX2,AVX2-FAST %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2,AVX2-FAST %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2,AVX2-FAST %s
; These patterns are produced by LoopVectorizer for interleaved loads.
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2,AVX2-SLOW %s
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-shuffle | FileCheck --check-prefixes=AVX2,AVX2-FAST %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2,AVX2-FAST %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2,AVX2-FAST %s
; These patterns are produced by LoopVectorizer for interleaved loads.
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2,AVX2-SLOW %s
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-shuffle | FileCheck --check-prefixes=AVX2,AVX2-FAST %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2,AVX2-FAST,AVX2-FAST-ALL %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2,AVX2-FAST,AVX2-FAST-PERLANE %s
; These patterns are produced by LoopVectorizer for interleaved loads.
@@ -322,80 +323,180 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: vf16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm14 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm2[1,2,3],xmm5[4],xmm2[5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm2[1,2,3],xmm3[4],xmm2[5,6,7] -; AVX2-FAST-NEXT: vpackusdw %xmm7, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0],xmm2[1,2,3],xmm14[4],xmm2[5,6,7] -; AVX2-FAST-NEXT: vpackusdw %xmm7, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm14, %xmm6 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] 
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm11[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm10, (%rsi) -; AVX2-FAST-NEXT: vmovdqa %ymm9, (%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: vf16: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX2-FAST-ALL-NEXT: vmovdqa 96(%rdi), %ymm13 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm14 +; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-FAST-ALL-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-FAST-ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm2[1,2,3],xmm5[4],xmm2[5,6,7] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm2[1,2,3],xmm3[4],xmm2[5,6,7] +; AVX2-FAST-ALL-NEXT: vpackusdw %xmm7, %xmm4, %xmm4 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0],xmm2[1,2,3],xmm14[4],xmm2[5,6,7] +; AVX2-FAST-ALL-NEXT: vpackusdw %xmm7, %xmm2, %xmm2 +; AVX2-FAST-ALL-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6] +; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm4, %ymm7 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29] +; AVX2-FAST-ALL-NEXT: vpermd %ymm8, %ymm4, %ymm4 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm4, %ymm12 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm5, %xmm6 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm6, %xmm1, %xmm0 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm6, %xmm14, %xmm6 +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm7, %ymm6 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm12 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpshufb %ymm12, %ymm4, %ymm4 +; AVX2-FAST-ALL-NEXT: vpblendd 
{{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm0, %ymm4 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29] +; AVX2-FAST-ALL-NEXT: vpermd %ymm8, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm0, %ymm8 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm11[2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm4, %ymm4 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm10, (%rsi) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm9, (%rdx) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, (%rcx) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%r8) +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vf16: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0],xmm0[1,2,3],xmm13[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm0[1,2,3],xmm12[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0],xmm0[1,2,3],xmm11[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0],xmm0[1,2,3],xmm10[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm5, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm0[1,2,3],xmm6[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm0[1,2,3],xmm7[4],xmm0[5,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3],xmm5[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd 
{{.*#+}} xmm6 = xmm11[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq %wide.vec = load <64 x i16>, <64 x i16>* %in.vec, align 32 %strided.vec0 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <16 x i32> @@ -656,180 +757,415 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: vf32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $216, %rsp -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm8[1,2,3],xmm9[4],xmm8[5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm8[1,2,3],xmm12[4],xmm8[5,6,7] -; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm8[1,2,3],xmm11[4],xmm8[5,6,7] -; AVX2-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm8[1,2,3],xmm2[4],xmm8[5,6,7] -; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa 
{{.*#+}} ymm15 = [0,2,2,3,0,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0],xmm8[1,2,3],xmm13[4],xmm8[5,6,7] -; AVX2-FAST-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm8[1,2,3],xmm0[4],xmm8[5,6,7] -; AVX2-FAST-NEXT: vpackusdw %xmm5, %xmm7, %xmm10 -; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm8[1,2,3],xmm5[4],xmm8[5,6,7] -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm8[1,2,3],xmm4[4],xmm8[5,6,7] -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm6, %xmm0 -; AVX2-FAST-NEXT: vpackusdw %xmm10, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm10 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm15, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm15 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm12, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm3 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm12 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm15 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: 
vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm13, %xmm2 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpshufd $231, (%rsp), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm7 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm7[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm5[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm14[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-NEXT: addq $216, %rsp -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: vf32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: subq $216, %rsp +; AVX2-FAST-ALL-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX2-FAST-ALL-NEXT: vmovdqa 48(%rdi), %xmm9 +; 
AVX2-FAST-ALL-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm8[1,2,3],xmm9[4],xmm8[5,6,7] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm8[1,2,3],xmm12[4],xmm8[5,6,7] +; AVX2-FAST-ALL-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm8[1,2,3],xmm11[4],xmm8[5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm8[1,2,3],xmm2[4],xmm8[5,6,7] +; AVX2-FAST-ALL-NEXT: vpackusdw %xmm1, %xmm4, %xmm1 +; AVX2-FAST-ALL-NEXT: vpackusdw %xmm0, %xmm1, %xmm1 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm15 = [0,2,2,3,0,2,4,6] +; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm15, %ymm2 +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm0, %ymm2, %ymm4 +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, %ymm3 +; AVX2-FAST-ALL-NEXT: vpermd %ymm5, %ymm15, %ymm2 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpshufb %ymm0, %ymm2, %ymm5 +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, %ymm14 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa 176(%rdi), %xmm13 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0],xmm8[1,2,3],xmm13[4],xmm8[5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm8[1,2,3],xmm0[4],xmm8[5,6,7] +; AVX2-FAST-ALL-NEXT: vpackusdw %xmm5, %xmm7, %xmm10 +; AVX2-FAST-ALL-NEXT: vmovdqa 144(%rdi), %xmm5 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm8[1,2,3],xmm5[4],xmm8[5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm8[1,2,3],xmm4[4],xmm8[5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-ALL-NEXT: vpackusdw %xmm0, %xmm6, %xmm0 +; AVX2-FAST-ALL-NEXT: vpackusdw %xmm10, %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm15, %ymm6 +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm15, %ymm1 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm6, %ymm15 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm14, %ymm1, %ymm14 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm14, %xmm9, %xmm0 +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm12, %xmm7 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm14, %xmm12, %xmm3 +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-ALL-NEXT: 
vpshufb %xmm3, %xmm11, %xmm0 +; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-ALL-NEXT: vpshufb %xmm3, %xmm8, %xmm12 +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm12, %ymm12 +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm11, %ymm15 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vpshufb %xmm14, %xmm13, %xmm0 +; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-ALL-NEXT: vpshufb %xmm14, %xmm13, %xmm2 +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm6, %ymm2 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7] +; AVX2-FAST-ALL-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-ALL-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpshufb %ymm2, %ymm14, %ymm2 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,2,0,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-ALL-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-FAST-ALL-NEXT: # xmm12 = mem[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm15 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm1, %ymm2 +; AVX2-FAST-ALL-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-ALL-NEXT: vpshufd $231, (%rsp), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-ALL-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[0,1,2,0,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; AVX2-FAST-ALL-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX2-FAST-ALL-NEXT: # xmm7 = mem[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-FAST-ALL-NEXT: # xmm5 = mem[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm15 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm6, %ymm1 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm8 = ymm14[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,3,1,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,3,1,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-ALL-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 32(%r8) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%r8) +; AVX2-FAST-ALL-NEXT: addq $216, %rsp +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vf32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: subq $280, %rsp # imm = 0x118 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm0[1,2,3],xmm5[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm0[1,2,3],xmm6[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm0[1,2,3],xmm15[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0],xmm0[1,2,3],xmm12[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm0[1,2,3],xmm14[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
%xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm0[1,2,3],xmm10[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0],xmm0[1,2,3],xmm13[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm3, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm7, %xmm7, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm12, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm14, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = 
xmm5[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm10, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,1,2,0,4,5,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm12 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: 
vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) +; AVX2-FAST-PERLANE-NEXT: addq $280, %rsp # imm = 0x118 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq %wide.vec = load <128 x i16>, <128 x i16>* %in.vec, align 32 %strided.vec0 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <32 x i32> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ 
-1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2-SLOW %s -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-shuffle | FileCheck --check-prefixes=AVX2-FAST %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST,AVX2-FAST-ALL %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST,AVX2-FAST-PERLANE %s ; These patterns are produced by LoopVectorizer for interleaved loads. @@ -329,94 +330,181 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: vf16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,0,2,4,6,1,3] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6],xmm5[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,u,u,4,7,1,6> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,2,u,u,5,7,2,4> -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,4,6,0,1,4,6,0] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,3,u,u,5,0,2,7> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,7,0,2,4,7,0] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm9, (%rsi) -; 
AVX2-FAST-NEXT: vmovdqa %ymm12, (%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r9) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: vf16: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FAST-ALL-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FAST-ALL-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] +; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,0,2,4,6,1,3] +; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm6, %ymm6 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm6, %ymm6 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] +; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm6, %xmm5 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6],xmm5[7] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,u,u,4,7,1,6> +; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm10, %ymm6 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm6, %ymm6 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <0,2,u,u,5,7,2,4> +; AVX2-FAST-ALL-NEXT: vpermd %ymm7, %ymm10, %ymm7 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,4,6,0,1,4,6,0] +; AVX2-FAST-ALL-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm7, %ymm7 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm7, %ymm7 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] +; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <0,3,u,u,5,0,2,7> +; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm10, %ymm6 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,7,0,2,4,7,0] +; AVX2-FAST-ALL-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm6, %ymm6 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] +; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5> +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5],ymm3[6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm9, (%rsi) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm12, (%rdx) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm7, (%rcx) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, (%r8) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FAST-ALL-NEXT: vzeroupper +; 
AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vf16: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0,1],xmm4[2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0],xmm5[1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12,13,14,15,4,5,14,15,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[0,1,2,3,0,1,10,11,u,u,u,u,u,u,u,u] +; 
AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq %wide.vec = load <80 x i16>, <80 x i16>* %in.vec, align 32 %strided.vec0 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2-SLOW %s -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-shuffle | FileCheck --check-prefixes=AVX2-FAST %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST %s ; These patterns are produced by LoopVectorizer for interleaved loads. diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2 %s -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-shuffle | FileCheck --check-prefixes=AVX2 %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2 %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2 %s ; These patterns are produced by LoopVectorizer for interleaved stores. diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2,AVX2-SLOW %s -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-shuffle | FileCheck --check-prefixes=AVX2,AVX2-FAST %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2,AVX2-FAST,AVX2-FAST-ALL %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2,AVX2-FAST,AVX2-FAST-PERLANE %s ; These patterns are produced by LoopVectorizer for interleaved stores. 
@@ -83,27 +84,50 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: vf8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,22,23,u,u,24,25,28,29,u,u,26,27] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rcx) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: vf8: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm4, %ymm4 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6] +; AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,22,23,u,u,24,25,28,29,u,u,26,27] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm3, (%rcx) +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vf8: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq %in.vec0 = load <8 x i16>, <8 x i16>* %in.vecptr0, align 32 %in.vec1 = load <8 x i16>, <8 x i16>* %in.vecptr1, align 32 %in.vec2 = load <8 x i16>, <8 x i16>* %in.vecptr2, align 32 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll @@ -1,20 +1,32 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2,AVX2-SLOW %s -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-shuffle | FileCheck --check-prefixes=AVX2,AVX2-FAST %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2-SLOW %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST,AVX2-FAST-ALL %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST,AVX2-FAST-PERLANE %s ; These patterns are produced by LoopVectorizer for interleaved stores. 
define void @vf2(<2 x i16>* %in.vecptr0, <2 x i16>* %in.vecptr1, <2 x i16>* %in.vecptr2, <2 x i16>* %in.vecptr3, <8 x i16>* %out.vec) nounwind { -; AVX2-LABEL: vf2: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15] -; AVX2-NEXT: vmovdqa %xmm0, (%r8) -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vf2: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%r8) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: vf2: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15] +; AVX2-FAST-NEXT: vmovdqa %xmm0, (%r8) +; AVX2-FAST-NEXT: retq %in.vec0 = load <2 x i16>, <2 x i16>* %in.vecptr0, align 32 %in.vec1 = load <2 x i16>, <2 x i16>* %in.vecptr1, align 32 %in.vec2 = load <2 x i16>, <2 x i16>* %in.vecptr2, align 32 @@ -48,21 +60,38 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: vf4: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: vf4: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31] +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%r8) +; 
AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vf4: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,22,23,30,31] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,20,21,28,29,u,u,u,u,22,23,30,31,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq %in.vec0 = load <4 x i16>, <4 x i16>* %in.vecptr0, align 32 %in.vec1 = load <4 x i16>, <4 x i16>* %in.vecptr1, align 32 %in.vec2 = load <4 x i16>, <4 x i16>* %in.vecptr2, align 32 @@ -79,28 +108,51 @@ } define void @vf8(<8 x i16>* %in.vecptr0, <8 x i16>* %in.vecptr1, <8 x i16>* %in.vecptr2, <8 x i16>* %in.vecptr3, <32 x i16>* %out.vec) nounwind { -; AVX2-LABEL: vf8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,0,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u> -; AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] -; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] -; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-NEXT: vmovdqa %ymm2, (%r8) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vf8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,0,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r8) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: vf8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,0,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r8) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq %in.vec0 = load <8 x i16>, <8 x i16>* %in.vecptr0, align 32 %in.vec1 = load <8 x i16>, <8 x i16>* %in.vecptr1, align 32 %in.vec2 = load <8 x i16>, <8 x i16>* %in.vecptr2, align 32 @@ -117,58 +169,111 @@ } define void @vf16(<16 x i16>* %in.vecptr0, <16 x i16>* %in.vecptr1, <16 x i16>* %in.vecptr2, <16 x i16>* %in.vecptr3, <64 x i16>* %out.vec) nounwind { -; AVX2-LABEL: vf16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rcx), %xmm5 -; AVX2-NEXT: vmovdqa 16(%rcx), %xmm8 -; AVX2-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-NEXT: vmovdqa 16(%rdx), %xmm9 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; 
AVX2-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] -; AVX2-NEXT: vmovdqa %ymm2, 96(%r8) -; AVX2-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-NEXT: vmovdqa %ymm1, 32(%r8) -; AVX2-NEXT: vmovdqa %ymm11, 64(%r8) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vf16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, 
%xmm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 64(%r8) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: vf16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm8 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,1,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = 
xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 64(%r8) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq %in.vec0 = load <16 x i16>, <16 x i16>* %in.vecptr0, align 32 %in.vec1 = load <16 x i16>, <16 x i16>* %in.vecptr1, align 32 %in.vec2 = load <16 x i16>, <16 x i16>* %in.vecptr2, align 32 @@ -185,108 +290,211 @@ } define void @vf32(<32 x i16>* %in.vecptr0, <32 x i16>* %in.vecptr1, <32 x i16>* %in.vecptr2, <32 x i16>* %in.vecptr3, <128 x i16>* %out.vec) nounwind { -; AVX2-LABEL: vf32: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rcx), %xmm15 -; AVX2-NEXT: vmovdqa 16(%rcx), %xmm12 -; AVX2-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-NEXT: vmovdqa 48(%rcx), %xmm2 -; AVX2-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-NEXT: vmovdqa 16(%rdx), %xmm13 -; AVX2-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX2-NEXT: vmovdqa 48(%rdx), %xmm7 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm8 -; AVX2-NEXT: vmovdqa 16(%rsi), %xmm14 -; AVX2-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm8 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 -; AVX2-NEXT: vmovdqa 48(%rsi), %xmm10 -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-NEXT: vpmovzxdq {{.*#+}} 
xmm4 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-NEXT: vmovdqa (%rsi), %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] -; AVX2-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm7, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] -; AVX2-NEXT: vmovdqa %ymm3, 96(%r8) -; 
AVX2-NEXT: vmovdqa %ymm1, (%r8) -; AVX2-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-NEXT: vmovdqa %ymm2, 192(%r8) -; AVX2-NEXT: vmovdqa %ymm11, 224(%r8) -; AVX2-NEXT: vmovdqa %ymm8, 128(%r8) -; AVX2-NEXT: vmovdqa %ymm9, 160(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 64(%r8) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vf32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdx), %xmm7 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = 
xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm7, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 96(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 192(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 224(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 128(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 160(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: vf32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm12 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm7 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm14 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} 
ymm11 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 192(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 224(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 128(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 160(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq %in.vec0 = load <32 x i16>, <32 x i16>* %in.vecptr0, align 32 %in.vec1 = load <32 x i16>, <32 x i16>* %in.vecptr1, align 32 %in.vec2 = load <32 x i16>, <32 x i16>* %in.vecptr2, align 32 diff --git 
a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2-SLOW %s -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-shuffle | FileCheck --check-prefixes=AVX2-FAST %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST,AVX2-FAST-ALL %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST,AVX2-FAST-PERLANE %s ; These patterns are produced by LoopVectorizer for interleaved stores. @@ -178,46 +179,90 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: vf8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm4 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,8,9,12,13],zero,zero,zero,zero,ymm5[2,3,18,19,18,19],zero,zero,zero,zero,ymm5[28,29,20,21,28,29],zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm6[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm6[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm6[22,23] -; AVX2-FAST-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <1,5,2,u,6,2,u,u> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[2,3,6,7,6,7],zero,zero,zero,zero,ymm6[8,9,16,17,18,19],zero,zero,zero,zero,ymm6[22,23,18,19,18,19],zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,5,2,6,2,6,3,u> -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,6,7],zero,zero,zero,zero,zero,zero,ymm8[8,9,12,13],zero,zero,zero,zero,zero,zero,ymm8[18,19,22,23],zero,zero,zero,zero,zero,zero,ymm8[24,25] -; AVX2-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,8,9,14,15,u,u,u,u,u,u,12,13] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6],xmm1[7] -; AVX2-FAST-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX2-FAST-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 32(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: vf8: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %xmm4 +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5 +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,8,9,12,13],zero,zero,zero,zero,ymm5[2,3,18,19,18,19],zero,zero,zero,zero,ymm5[28,29,20,21,28,29],zero,zero +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm6 +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm6[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm6[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm6[22,23] +; AVX2-FAST-ALL-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX2-FAST-ALL-NEXT: vpbroadcastq (%r8), %ymm6 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = <1,5,2,u,6,2,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm8, %ymm6 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[2,3,6,7,6,7],zero,zero,zero,zero,ymm6[8,9,16,17,18,19],zero,zero,zero,zero,ymm6[22,23,18,19,18,19],zero,zero +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm8 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm9 = <1,5,2,6,2,6,3,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,6,7],zero,zero,zero,zero,zero,zero,ymm8[8,9,12,13],zero,zero,zero,zero,zero,zero,ymm8[18,19,22,23],zero,zero,zero,zero,zero,zero,ymm8[24,25] +; AVX2-FAST-ALL-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX2-FAST-ALL-NEXT: vpbroadcastq 8(%rdi), %ymm8 +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6 +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,8,9,14,15,u,u,u,u,u,u,12,13] +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6],xmm1[7] +; AVX2-FAST-ALL-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm0, 64(%r9) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm6, 32(%r9) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vf8: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; 
AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[6,7,u,u,u,u,10,11,u,u,8,9,u,u,u,u,22,23,u,u,u,u,26,27,u,u,24,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,6,7,10,11,u,u,8,9,u,u,8,9,12,13,u,u,22,23,26,27,u,u,24,25,u,u,24,25,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,10,11,6,7,u,u,u,u,10,11,12,13,8,9,24,25,26,27,22,23,u,u,u,u,26,27,28,29,24,25] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4],ymm7[5,6,7,8,9,10],ymm6[11,12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,12,13],zero,zero,zero,zero,ymm6[2,3,18,19,18,19],zero,zero,zero,zero,ymm6[28,29,20,21,28,29],zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm8[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm8[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm8[22,23] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,8,9,14,15,u,u,u,u,u,u,12,13] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq %in.vec0 = load <8 x i16>, <8 x i16>* %in.vecptr0, align 32 %in.vec1 = load <8 x i16>, <8 x i16>* %in.vecptr1, align 32 %in.vec2 = load <8 x i16>, <8 x i16>* %in.vecptr2, align 32 @@ -315,79 +360,153 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: vf16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm11 -; 
AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm8, %ymm9 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,2,2,2] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,u,u,10,11,u,u,8,9,8,9,u,u,12,13] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = mem[2,1,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,6,7,u,u,10,11,u,u,u,u,8,9] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3],xmm0[4],xmm5[5,6],xmm0[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm8 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,u,u,24,25,20,21,u,u,24,25] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3],ymm1[4],ymm5[5,6],ymm1[7],ymm5[8,9],ymm1[10],ymm5[11],ymm1[12],ymm5[13,14],ymm1[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [25769803781,25769803781,25769803781,25769803781] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} 
ymm6 = ymm3[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm12[1,1,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4],ymm4[5],ymm2[6,7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12],ymm4[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 128(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 32(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r9) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: vf16: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm11 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %ymm2 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm6 +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm0 +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm7, %ymm1, %ymm1 +; AVX2-FAST-ALL-NEXT: vpbroadcastq (%r8), %ymm8 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm1, %ymm8, %ymm9 +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,2,2,2] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,u,u,10,11,u,u,8,9,8,9,u,u,12,13] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = 
mem[2,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,6,7,u,u,10,11,u,u,u,u,8,9] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3],xmm0[4],xmm5[5,6],xmm0[7] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vpbroadcastq 8(%rdi), %ymm1 +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm8 +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,0,1,4,5,4,5] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,u,u,24,25,20,21,u,u,24,25] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[0,1,2,1,4,5,6,5] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3],ymm1[4],ymm5[5,6],ymm1[7],ymm5[8,9],ymm1[10],ymm5[11],ymm1[12],ymm5[13,14],ymm1[15] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [25769803781,25769803781,25769803781,25769803781] +; AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm1, %ymm1 +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[3,2,3,3,7,6,7,7] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm5 = ymm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm12[1,1,2,2] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4],ymm4[5],ymm2[6,7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12],ymm4[13],ymm2[14,15] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] +; AVX2-FAST-ALL-NEXT: 
vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, 64(%r9) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 128(%r9) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, 96(%r9) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm8, 32(%r9) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm9, (%r9) +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vf16: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm8, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,u,u,10,11,u,u,8,9,8,9,u,u,12,13] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = mem[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,6,7,u,u,10,11,u,u,u,u,8,9] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3],xmm0[4],xmm5[5,6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,u,u,24,25,20,21,u,u,24,25] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3],ymm1[4],ymm5[5,6],ymm1[7],ymm5[8,9],ymm1[10],ymm5[11],ymm1[12],ymm5[13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: 
vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm5 = ymm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm12[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4],ymm4[5],ymm2[6,7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12],ymm4[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 128(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r9) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq %in.vec0 = load <16 x i16>, <16 x i16>* %in.vecptr0, align 32 %in.vec1 = load <16 x i16>, <16 x i16>* %in.vecptr1, align 32 %in.vec2 = load <16 x i16>, <16 x i16>* %in.vecptr2, align 32 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2-SLOW %s -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-shuffle | FileCheck --check-prefixes=AVX2-FAST %s +; RUN: llc < %s -mtriple=x86_64-- 
-mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST,AVX2-FAST-ALL %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST,AVX2-FAST-PERLANE %s ; These patterns are produced by LoopVectorizer for interleaved stores. @@ -100,37 +101,67 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: vf4: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm2[0],xmm3[0] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,6,1,3,4,6,1,3] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,28,29,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4,5],ymm4[6],ymm6[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: vf4: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 +; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm2[0],xmm3[0] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,6,1,3,4,6,1,3] +; AVX2-FAST-ALL-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm4, %ymm7, %ymm4 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,28,29,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4,5],ymm4[6],ymm6[7] +; 
AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm0, 32(%rax) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vf4: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm2[0],xmm3[0] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4,5],ymm4[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq %in.vec0 = load <4 x i16>, <4 x i16>* %in.vecptr0, align 32 %in.vec1 = load <4 x i16>, <4 x i16>* %in.vecptr1, align 32 %in.vec2 = load <4 x i16>, <4 x i16>* %in.vecptr2, align 32 @@ -191,54 +222,93 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: vf8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,1,5,0,4,1,5] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 -; 
AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u,24,25,28,29] -; AVX2-FAST-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] -; AVX2-FAST-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,4,0,0,4,4,0] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,12,13,8,9,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u,u,u,u,u,18,19,26,27] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,1,5,2,6,1,5] -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,10,11,14,15,u,u,u,u,u,u,u,u,16,17,20,21,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,3,7,2,6,3,7] -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,6,7,u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,u,u,u,u,26,27,30,31,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,3,3,7,7,3,3,7] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,u,u,u,u,22,23,18,19,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: vf8: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %xmm2 +; AVX2-FAST-ALL-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,1,5,0,4,1,5] +; AVX2-FAST-ALL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm3, %ymm3 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u,24,25,28,29] +; AVX2-FAST-ALL-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm4[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-FAST-ALL-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,4,0,0,4,4,0] +; AVX2-FAST-ALL-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm4, %ymm4 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,12,13,8,9,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u,u,u,u,u,18,19,26,27] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,1,5,2,6,1,5] +; AVX2-FAST-ALL-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm5, %ymm5 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,10,11,14,15,u,u,u,u,u,u,u,u,16,17,20,21,u,u,u,u,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,3,7,2,6,3,7] +; AVX2-FAST-ALL-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,6,7,u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,u,u,u,u,26,27,30,31,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,3,3,7,7,3,3,7] +; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,u,u,u,u,22,23,18,19,u,u,u,u,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, 32(%rax) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vf8: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: 
vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u,u,u,u,u,18,19,26,27] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq %in.vec0 = load <8 x i16>, <8 x i16>* %in.vecptr0, align 32 %in.vec1 = load <8 x i16>, <8 x i16>* %in.vecptr1, align 32 %in.vec2 = load <8 x i16>, <8 x i16>* %in.vecptr2, align 32 @@ -356,98 +426,192 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: vf16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm11 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm14 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm15 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; 
AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm4 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <1,u,u,2,u,u,3,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[8],ymm6[8],ymm3[9],ymm6[9],ymm3[10],ymm6[10],ymm3[11],ymm6[11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[1,2,2,3,5,6,6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[1,2,2,3,5,6,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm12[0],ymm6[0],ymm12[1],ymm6[1],ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[8],ymm6[8],ymm12[9],ymm6[9],ymm12[10],ymm6[10],ymm12[11],ymm6[11] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6],ymm3[7] -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[8],ymm15[8],ymm13[9],ymm15[9],ymm13[10],ymm15[10],ymm13[11],ymm15[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm14[4],ymm12[4],ymm14[5],ymm12[5],ymm14[6],ymm12[6],ymm14[7],ymm12[7],ymm14[12],ymm12[12],ymm14[13],ymm12[13],ymm14[14],ymm12[14],ymm14[15],ymm12[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <5,u,u,6,u,u,7,u> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm15[4],ymm13[5],ymm15[5],ymm13[6],ymm15[6],ymm13[7],ymm15[7],ymm13[12],ymm15[12],ymm13[13],ymm15[13],ymm13[14],ymm15[14],ymm13[15],ymm15[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm2, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: vf16: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm10 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm11 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %ymm14 +; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %ymm13 +; AVX2-FAST-ALL-NEXT: vmovdqa (%r9), %ymm15 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm8 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm0, %xmm8, %xmm1 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovdqa (%r9), %xmm5 +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[1,2,2,3] +; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %xmm0 +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; 
AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm4 +; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm6 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm6 = <1,u,u,2,u,u,3,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm6, %ymm3 +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6],ymm6[7] +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm11, %ymm6 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm10, %ymm3 +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[8],ymm6[8],ymm3[9],ymm6[9],ymm3[10],ymm6[10],ymm3[11],ymm6[11] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[1,2,2,3,5,6,6,7] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[1,2,2,3,5,6,6,7] +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm12[0],ymm6[0],ymm12[1],ymm6[1],ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[8],ymm6[8],ymm12[9],ymm6[9],ymm12[10],ymm6[10],ymm12[11],ymm6[11] +; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %ymm12 +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6],ymm3[7] +; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm6 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm2 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[8],ymm15[8],ymm13[9],ymm15[9],ymm13[10],ymm15[10],ymm13[11],ymm15[11] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm14[4],ymm12[4],ymm14[5],ymm12[5],ymm14[6],ymm12[6],ymm14[7],ymm12[7],ymm14[12],ymm12[12],ymm14[13],ymm12[13],ymm14[14],ymm12[14],ymm14[15],ymm12[15] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = <5,u,u,6,u,u,7,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm15[4],ymm13[5],ymm15[5],ymm13[6],ymm15[6],ymm13[7],ymm15[7],ymm13[12],ymm15[12],ymm13[13],ymm15[13],ymm13[14],ymm15[14],ymm13[15],ymm15[15] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX2-FAST-ALL-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, 160(%rax) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 96(%rax) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm3, 128(%rax) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm9, 64(%rax) +; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vf16: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3,4],ymm4[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[8],ymm15[8],ymm13[9],ymm15[9],ymm13[10],ymm15[10],ymm13[11],ymm15[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = 
ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm9[0],ymm14[1],ymm9[1],ymm14[2],ymm9[2],ymm14[3],ymm9[3],ymm14[8],ymm9[8],ymm14[9],ymm9[9],ymm14[10],ymm9[10],ymm14[11],ymm9[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm14[4],ymm9[4],ymm14[5],ymm9[5],ymm14[6],ymm9[6],ymm14[7],ymm9[7],ymm14[12],ymm9[12],ymm14[13],ymm9[13],ymm14[14],ymm9[14],ymm14[15],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,2,3,6,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm15[4],ymm13[5],ymm15[5],ymm13[6],ymm15[6],ymm13[7],ymm15[7],ymm13[12],ymm15[12],ymm13[13],ymm15[13],ymm13[14],ymm15[14],ymm13[15],ymm15[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm4 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
%ymm2, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq %in.vec0 = load <16 x i16>, <16 x i16>* %in.vecptr0, align 32 %in.vec1 = load <16 x i16>, <16 x i16>* %in.vecptr1, align 32 %in.vec2 = load <16 x i16>, <16 x i16>* %in.vecptr2, align 32 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll @@ -1,9 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX2OR512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX2OR512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL define <2 x i64> @unpckh_unary_extracted_v4i64(<4 x i64> %x) { ; AVX1-LABEL: unpckh_unary_extracted_v4i64: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -4,9 +4,12 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL,AVX512VLBW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL,AVX512VLVBMI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | 
FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL,AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL,AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL,AVX512VLVBMI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512vbmi,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL,AVX512VLVBMI ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,XOP,XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,XOP,XOPAVX2 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -5,8 +5,10 @@ ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW -; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST -; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL +; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST +; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST +; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL +; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) { ; SSE-LABEL: shuffle_v4i32_0001: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -4,9 +4,11 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 ; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX2,AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL,AVX512VL-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL,AVX512VL-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL,AVX512VL-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL,AVX512VL-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=AVX,XOP,XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=AVX,XOP,XOPAVX2 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -1,9 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX1OR2,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST,AVX2-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST,AVX2-FAST-PERLANE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VL-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VL-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VL-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VL-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s 
--check-prefixes=ALL,XOP,XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX2 @@ -52,12 +54,18 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX512VL: # %bb.0: @@ -101,12 +109,18 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX512VL: # %bb.0: @@ -150,12 +164,18 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: 
shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; AVX512VL: # %bb.0: @@ -987,13 +1007,19 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,1,0,0,0,1] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,1,0,0,0,1] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: ; AVX512VL: # %bb.0: @@ -1034,13 +1060,19 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: ; AVX512VL: # %bb.0: @@ -1081,13 +1113,19 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX2-FAST-PERLANE: # %bb.0: +; 
AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: ; AVX512VL: # %bb.0: @@ -3482,12 +3520,20 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,6,u,4,6,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,4,5,4,5,4,5,8,9,16,17,16,17,16,17,16,17,20,21,20,21,20,21,20,21] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,6,u,4,6,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,4,5,4,5,4,5,8,9,16,17,16,17,16,17,16,17,20,21,20,21,20,21,20,21] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,u,u,16,17,16,17,16,17,16,17,24,25,24,25,24,25,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpsllq $48, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: ; AVX512VL: # %bb.0: @@ -3667,12 +3713,20 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,4,u,6,4,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,4,5,4,5,4,5,8,9,16,17,16,17,16,17,16,17,20,21,20,21,20,21,20,21] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,4,u,6,4,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,4,5,4,5,4,5,8,9,16,17,16,17,16,17,16,17,20,21,20,21,20,21,20,21] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,u,u,24,25,24,25,24,25,24,25,16,17,16,17,16,17,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: ; AVX512VL: # %bb.0: @@ -3896,12 +3950,20 @@ ; 
AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u,4,6,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,2,3,8,9,18,19,16,17,22,23,20,21,22,23,20,21,18,19,16,17] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u,4,6,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,2,3,8,9,18,19,16,17,22,23,20,21,22,23,20,21,18,19,16,17] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,u,u,18,19,16,17,26,27,24,25,26,27,24,25,18,19,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: ; AVX512VL: # %bb.0: @@ -3947,12 +4009,20 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,4,u,6,4,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,2,3,0,1,6,7,8,9,18,19,16,17,22,23,20,21,18,19,16,17,22,23,20,21] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,4,u,6,4,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,2,3,0,1,6,7,8,9,18,19,16,17,22,23,20,21,18,19,16,17,22,23,20,21] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,u,u,26,27,24,25,18,19,16,17,26,27,24,25,18,19,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: ; AVX512VL: # %bb.0: @@ -3998,12 +4068,20 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: retq ; -; 
AVX2-FAST-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,2,3,8,9,18,19,16,17,22,23,20,21,22,23,20,21,18,19,16,17] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,2,3,8,9,18,19,16,17,22,23,20,21,22,23,20,21,18,19,16,17] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,u,u,26,27,24,25,18,19,16,17,18,19,16,17,26,27,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpsllq $48, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: ; AVX512VL: # %bb.0: @@ -4049,12 +4127,20 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u,4,6,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,0,1,4,5,4,5,8,9,16,17,20,21,20,21,16,17,16,17,20,21,20,21,16,17] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u,4,6,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,0,1,4,5,4,5,8,9,16,17,20,21,20,21,16,17,16,17,20,21,20,21,16,17] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,u,u,16,17,24,25,24,25,16,17,16,17,24,25,24,25,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: ; AVX512VL: # %bb.0: @@ -4100,12 +4186,20 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,4,5,0,1,0,1,4,5,4,5,8,9,16,17,20,21,20,21,16,17,16,17,20,21,20,21,16,17] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,0,1,4,5,4,5,8,9,16,17,20,21,20,21,16,17,16,17,20,21,20,21,16,17] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,u,u,24,25,16,17,16,17,24,25,24,25,16,17,16,17,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpsllq $48, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: ; AVX512VL: # %bb.0: @@ -4276,12 +4370,19 @@ ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,0,5,7,6,4,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,2,3,6,7,10,11,0,1,4,5,14,15,16,17,16,17,18,19,22,23,26,27,16,17,20,21,30,31] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,0,5,7,6,4,5] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,2,3,6,7,10,11,0,1,4,5,14,15,16,17,16,17,18,19,22,23,26,27,16,17,20,21,30,31] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX512VL: # %bb.0: @@ -4326,12 +4427,20 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,6,u,4,6,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,4,5,4,5,4,5,8,9,16,17,16,17,20,21,20,21,20,21,20,21,20,21,20,21] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,6,u,4,6,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,4,5,4,5,4,5,8,9,16,17,16,17,20,21,20,21,20,21,20,21,20,21,20,21] +; AVX2-FAST-ALL-NEXT: retq +; +; 
AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,u,u,16,17,16,17,24,25,24,25,24,25,24,25,24,25,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpsllq $48, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: ; AVX512VL: # %bb.0: @@ -4377,12 +4486,20 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,0,1,0,1,0,1,8,9,16,17,16,17,20,21,20,21,16,17,16,17,16,17,16,17] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,0,1,0,1,0,1,8,9,16,17,16,17,20,21,20,21,16,17,16,17,16,17,16,17] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,u,u,24,25,24,25,16,17,16,17,24,25,24,25,24,25,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpsllq $48, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: ; AVX512VL: # %bb.0: @@ -4428,12 +4545,20 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,6,u,4,6,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,4,5,4,5,4,5,8,9,16,17,20,21,20,21,16,17,20,21,20,21,20,21,20,21] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,6,u,4,6,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,4,5,4,5,4,5,8,9,16,17,20,21,20,21,16,17,20,21,20,21,20,21,20,21] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,u,u,16,17,24,25,24,25,16,17,24,25,24,25,24,25,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpsllq $48, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: ; AVX512VL: # %bb.0: @@ -4479,12 +4604,20 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u,4,6,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,0,1,0,1,0,1,8,9,16,17,20,21,20,21,16,17,16,17,16,17,16,17,16,17] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u,4,6,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,0,1,0,1,0,1,8,9,16,17,20,21,20,21,16,17,16,17,16,17,16,17,16,17] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,u,u,16,17,24,25,24,25,16,17,16,17,16,17,16,17,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: ; AVX512VL: # %bb.0: @@ -4531,12 +4664,19 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,3,7,4,6,7,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,4,5,6,7,8,9,14,15,16,17,20,21,20,21,16,17,20,21,22,23,24,25,26,27] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,3,7,4,6,7,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,4,5,6,7,8,9,14,15,16,17,20,21,20,21,16,17,20,21,22,23,24,25,26,27] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,u,u,16,17,24,25,24,25,16,17,24,25,26,27,28,29,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: ; AVX512VL: # %bb.0: @@ -4583,12 +4723,20 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: retq ; -; 
AVX2-FAST-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,6,u,4,6,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,4,5,4,5,4,5,4,5,4,5,8,9,16,17,u,u,20,21,20,21,20,21,20,21,20,21,20,21] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,6,u,4,6,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,4,5,4,5,4,5,4,5,4,5,8,9,16,17,u,u,20,21,20,21,20,21,20,21,20,21,20,21] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,u,u,16,17,18,19,24,25,24,25,24,25,24,25,24,25,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpsllq $48, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: ; AVX512VL: # %bb.0: @@ -4634,12 +4782,20 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,u,u,4,5,0,1,0,1,0,1,8,9,16,17,16,17,u,u,20,21,16,17,16,17,16,17,16,17] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,u,u,4,5,0,1,0,1,0,1,8,9,16,17,16,17,u,u,20,21,16,17,16,17,16,17,16,17] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,u,u,24,25,24,25,24,25,16,17,24,25,24,25,24,25,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpsllq $48, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: ; AVX512VL: # %bb.0: @@ -4685,12 +4841,20 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,0,1,4,5,0,1,0,1,0,1,8,9,u,u,16,17,16,17,20,21,16,17,16,17,16,17,16,17] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: +; AVX2-FAST-ALL: # %bb.0: +; 
AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,0,1,4,5,0,1,0,1,0,1,8,9,u,u,16,17,16,17,20,21,16,17,16,17,16,17,16,17] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,u,u,16,17,24,25,24,25,16,17,24,25,24,25,24,25,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpsllq $48, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: ; AVX512VL: # %bb.0: @@ -4952,12 +5116,19 @@ ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,0,6,5,7,4,6] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,8,9,0,1,6,7,2,3,14,15,18,19,22,23,26,27,24,25,16,17,22,23,18,19,30,31] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,0,6,5,7,4,6] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,8,9,0,1,6,7,2,3,14,15,18,19,22,23,26,27,24,25,16,17,22,23,18,19,30,31] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX512VL: # %bb.0: @@ -5197,14 +5368,23 @@ ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,3,7,u,4,7,u,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,10,11,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,3,7,u,4,7,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5,10,11,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7],ymm1[8,9,10,11,12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,3,2,3,4,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; AVX512VL: # %bb.0: @@ -5259,14 +5439,23 @@ ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,0,4,u,6,4,u,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,10,11,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <2,0,4,u,6,4,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,10,11,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: ; AVX512VL: # %bb.0: @@ -5864,13 +6053,21 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: 
shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,2,3,2,3,8,9,10,11,14,15,u,u,18,19,18,19,18,19,18,19,24,25,26,27,30,31,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4] +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,2,3,2,3,8,9,10,11,14,15,u,u,18,19,18,19,18,19,18,19,24,25,26,27,30,31,u,u] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,2,3,2,3,8,9,10,11,14,15,u,u,18,19,18,19,18,19,18,19,24,25,26,27,30,31,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: ; AVX512VL: # %bb.0: @@ -7248,16 +7445,30 @@ ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: PR24935: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,5,u,u,0,4,6,2> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[6,7],zero,zero,ymm0[18,19,22,23],zero,zero,zero,zero,ymm0[26,27,28,29,16,17],zero,zero -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <5,6,3,0,0,6,4,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,0,1],zero,zero,ymm1[6,7,0,1,10,11],zero,zero,ymm1[12,13],zero,zero,zero,zero,ymm1[16,17,20,21],zero,zero,zero,zero,zero,zero,ymm1[24,25] -; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: PR24935: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,5,u,u,0,4,6,2> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[6,7],zero,zero,ymm0[18,19,22,23],zero,zero,zero,zero,ymm0[26,27,28,29,16,17],zero,zero +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <5,6,3,0,0,6,4,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,0,1],zero,zero,ymm1[6,7,0,1,10,11],zero,zero,ymm1[12,13],zero,zero,zero,zero,ymm1[16,17,20,21],zero,zero,zero,zero,zero,zero,ymm1[24,25] +; AVX2-FAST-ALL-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: PR24935: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm1[8,9],zero,zero,zero,zero,ymm1[14,15,12,13,0,1,24,25,24,25],zero,zero,ymm1[24,25,16,17,30,31,28,29,16,17] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb 
{{.*#+}} ymm1 = ymm1[6,7,4,5],zero,zero,ymm1[10,11,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,2,3,2,3,u,u,10,11,u,u,u,u,u,u,u,u,18,19,18,19,u,u,26,27,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: PR24935: ; AVX512VL: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -1,11 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX2,AVX2-FAST,AVX2-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX2,AVX2-FAST,AVX2-FAST-PERLANE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VLBW,AVX512VLBW-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VLBW,AVX512VLBW-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VLBW,AVX512VLBW-FAST,AVX512VLBW-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VLBW,AVX512VLBW-FAST,AVX512VLBW-FAST-PERLANE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VLVBMI,AVX512VLVBMI-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VLVBMI,AVX512VLVBMI-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VLVBMI,AVX512VLVBMI-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512vbmi,+fast-variable-perlane-shuffle | 
FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VLVBMI,AVX512VLVBMI-FAST-PERLANE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX2 @@ -64,11 +67,17 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; XOPAVX1: # %bb.0: @@ -114,11 +123,17 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; XOPAVX1: # %bb.0: @@ -164,11 +179,17 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; XOPAVX1: # %bb.0: @@ -214,11 +235,17 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -264,11 +291,17 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -314,11 +347,17 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -364,11 +403,17 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -414,11 +459,17 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -464,11 +515,17 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -514,11 +571,17 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, 
%ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -564,11 +627,17 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -614,11 +683,17 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -664,11 +739,17 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; 
AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -714,11 +795,17 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -764,11 +851,17 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -1822,12 +1915,18 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX512VLVBMI-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VLVBMI-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; XOPAVX1: # %bb.0: @@ -1869,12 +1968,18 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX512VLVBMI-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; 
AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; XOPAVX1: # %bb.0: @@ -1916,12 +2021,18 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -1963,12 +2074,18 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -2010,12 +2127,18 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: 
shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -2057,12 +2180,18 @@ ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VLVBMI-SLOW-NEXT: retq ; -; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST: # %bb.0: -; AVX512VLVBMI-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-NEXT: retq +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -3276,18 +3405,32 @@ ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = 
zero,zero,ymm1[u,u],zero,zero,ymm1[12],zero,ymm1[u,u,u],zero,zero,ymm1[u,0,3,u,u],zero,ymm1[u],zero,zero,ymm1[21,16],zero,ymm1[26],zero,ymm1[u,20,18,20,23] -; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <3,4,5,7,5,4,1,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,u,u,5,10,13,u,u,0,u,u,16,23,u,23,u,u,u,u,u,u,u,27,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255,0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[u,u],zero,zero,ymm1[12],zero,ymm1[u,u,u],zero,zero,ymm1[u,0,3,u,u],zero,ymm1[u],zero,zero,ymm1[21,16],zero,ymm1[26],zero,ymm1[u,20,18,20,23] +; AVX2-FAST-ALL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <3,4,5,7,5,4,1,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,u,u,5,10,13,u,u,0,u,u,16,23,u,23,u,u,u,u,u,u,u,27,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255,0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[u,u],zero,zero,ymm1[12],zero,ymm1[u,u,u],zero,zero,ymm1[u,0,3,u,u],zero,ymm1[u],zero,zero,ymm1[21,16],zero,ymm1[26],zero,ymm1[u,20,18,20,23] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255,0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: ; AVX512VLBW-SLOW: # %bb.0: @@ -3304,18 +3447,33 @@ ; AVX512VLBW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1} ; AVX512VLBW-SLOW-NEXT: retq ; -; AVX512VLBW-FAST-LABEL: 
shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: -; AVX512VLBW-FAST: # %bb.0: -; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <3,4,5,7,5,4,1,u> -; AVX512VLBW-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13,u,u,3,3],zero,ymm1[8,u,u,u,12,1,u],zero,zero,ymm1[u,u,20,u,17,22],zero,zero,ymm1[16],zero,ymm1[27,u],zero,zero,zero,zero -; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23] -; AVX512VLBW-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512VLBW-FAST-NEXT: movl $134948620, %eax # imm = 0x80B270C -; AVX512VLBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[u,u,0,1,u,u,u,u,5,10,13,u,u,0,u,u,16,23,u,23,u,u,u,u,u,u,u,27,u,u,u,u] -; AVX512VLBW-FAST-NEXT: retq +; AVX512VLBW-FAST-ALL-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: +; AVX512VLBW-FAST-ALL: # %bb.0: +; AVX512VLBW-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <3,4,5,7,5,4,1,u> +; AVX512VLBW-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX512VLBW-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13,u,u,3,3],zero,ymm1[8,u,u,u,12,1,u],zero,zero,ymm1[u,u,20,u,17,22],zero,zero,ymm1[16],zero,ymm1[27,u],zero,zero,zero,zero +; AVX512VLBW-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX512VLBW-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23] +; AVX512VLBW-FAST-ALL-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512VLBW-FAST-ALL-NEXT: movl $134948620, %eax # imm = 0x80B270C +; AVX512VLBW-FAST-ALL-NEXT: kmovd %eax, %k1 +; AVX512VLBW-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[u,u,0,1,u,u,u,u,5,10,13,u,u,0,u,u,16,23,u,23,u,u,u,u,u,u,u,27,u,u,u,u] +; AVX512VLBW-FAST-ALL-NEXT: retq +; +; AVX512VLBW-FAST-PERLANE-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: +; AVX512VLBW-FAST-PERLANE: # %bb.0: +; AVX512VLBW-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512VLBW-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512VLBW-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u] +; AVX512VLBW-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7] +; AVX512VLBW-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13,u,u,3,3],zero,ymm1[8,u,u,u,12,1,u],zero,zero,ymm1[u,u,20,u,17,22],zero,zero,ymm1[16],zero,ymm1[27,u],zero,zero,zero,zero +; AVX512VLBW-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX512VLBW-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23] +; AVX512VLBW-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512VLBW-FAST-PERLANE-NEXT: movl $134948620, %eax # imm = 0x80B270C +; AVX512VLBW-FAST-PERLANE-NEXT: kmovd %eax, %k1 +; AVX512VLBW-FAST-PERLANE-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1} +; AVX512VLBW-FAST-PERLANE-NEXT: retq ; ; AVX512VLVBMI-LABEL: 
shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: ; AVX512VLVBMI: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -1,9 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2 --check-prefix=AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2 --check-prefix=AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2 --check-prefix=AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2 --check-prefix=AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST --check-prefix=AVX512VL-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST --check-prefix=AVX512VL-FAST-PERLANE define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) { ; AVX1-LABEL: shuffle_v4f64_0000: @@ -457,11 +459,17 @@ ; AVX512VL-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v4f64_1054: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [1,0,5,4] -; AVX512VL-FAST-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_1054: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [1,0,5,4] +; AVX512VL-FAST-ALL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4f64_1054: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -479,11 +487,17 @@ ; AVX512VL-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v4f64_3254: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [3,2,5,4] -; AVX512VL-FAST-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_3254: +; 
AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [3,2,5,4] +; AVX512VL-FAST-ALL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4f64_3254: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -501,11 +515,17 @@ ; AVX512VL-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v4f64_3276: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [3,2,7,6] -; AVX512VL-FAST-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_3276: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [3,2,7,6] +; AVX512VL-FAST-ALL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4f64_3276: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-FAST-PERLANE-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -636,12 +656,18 @@ ; AVX512VL-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v4f64_0456: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [4,0,1,2] -; AVX512VL-FAST-NEXT: vpermi2pd %ymm0, %ymm1, %ymm2 -; AVX512VL-FAST-NEXT: vmovapd %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_0456: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [4,0,1,2] +; AVX512VL-FAST-ALL-NEXT: vpermi2pd %ymm0, %ymm1, %ymm2 +; AVX512VL-FAST-ALL-NEXT: vmovapd %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4f64_0456: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -685,11 +711,17 @@ ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v4f64_0044: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,0,4,4] -; AVX512VL-FAST-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_0044: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [0,0,4,4] +; AVX512VL-FAST-ALL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4f64_0044: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512VL-FAST-PERLANE-NEXT: retq %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %1 } @@ -954,11 +986,17 @@ ; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512VL-SLOW-NEXT: retq ; -; 
AVX512VL-FAST-LABEL: shuffle_v4i64_0124: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,4] -; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_0124: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,4] +; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_0124: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512VL-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -1029,11 +1067,17 @@ ; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v4i64_4012: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,0,1,2] -; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_4012: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,0,1,2] +; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_4012: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2] +; AVX512VL-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -1124,11 +1168,17 @@ ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v4i64_2u35: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,5,3,5] -; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_2u35: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,5,3,5] +; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_2u35: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -1176,11 +1226,17 @@ ; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v4i64_1054: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,5,4] -; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_1054: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,5,4] +; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_1054: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> 
%shuffle } @@ -1204,11 +1260,17 @@ ; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v4i64_3254: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,5,4] -; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_3254: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,5,4] +; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_3254: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -1232,11 +1294,17 @@ ; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v4i64_3276: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,7,6] -; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_3276: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,7,6] +; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_3276: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -1876,14 +1944,23 @@ ; AVX512VL-SLOW-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: add_v4i64_0246_1357: -; AVX512VL-FAST: # %bb.0: # %entry -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6] -; AVX512VL-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7] -; AVX512VL-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm3 -; AVX512VL-FAST-NEXT: vpaddq %ymm3, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: add_v4i64_0246_1357: +; AVX512VL-FAST-ALL: # %bb.0: # %entry +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6] +; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7] +; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm0, %ymm3 +; AVX512VL-FAST-ALL-NEXT: vpaddq %ymm3, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: add_v4i64_0246_1357: +; AVX512VL-FAST-PERLANE: # %bb.0: # %entry +; AVX512VL-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX512VL-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-FAST-PERLANE-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: retq entry: %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> %shuffle1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -1923,14 +2000,23 @@ ; AVX512VL-SLOW-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: add_v4i64_4602_5713: -; 
AVX512VL-FAST: # %bb.0: # %entry -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6] -; AVX512VL-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7] -; AVX512VL-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm3 -; AVX512VL-FAST-NEXT: vpaddq %ymm3, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: add_v4i64_4602_5713: +; AVX512VL-FAST-ALL: # %bb.0: # %entry +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6] +; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7] +; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm0, %ymm1, %ymm3 +; AVX512VL-FAST-ALL-NEXT: vpaddq %ymm3, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: add_v4i64_4602_5713: +; AVX512VL-FAST-PERLANE: # %bb.0: # %entry +; AVX512VL-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX512VL-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-FAST-PERLANE-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: retq entry: %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> %shuffle1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -1,9 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST-PERLANE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST --check-prefix=AVX512VL-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=ALL 
--check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST --check-prefix=AVX512VL-FAST-PERLANE define <8 x float> @shuffle_v8f32_00000000(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_00000000: @@ -34,11 +36,17 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8f32_00000010: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8f32_00000010: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_00000010: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_00000010: ; AVX512VL-SLOW: # %bb.0: @@ -46,11 +54,17 @@ ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8f32_00000010: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_00000010: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_00000010: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -69,11 +83,17 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8f32_00000200: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8f32_00000200: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_00000200: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_00000200: ; AVX512VL-SLOW: # %bb.0: @@ -81,11 +101,17 @@ ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8f32_00000200: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_00000200: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_00000200: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpermilps 
{{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -104,11 +130,17 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8f32_00003000: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8f32_00003000: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_00003000: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_00003000: ; AVX512VL-SLOW: # %bb.0: @@ -116,11 +148,17 @@ ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8f32_00003000: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_00003000: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_00003000: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -234,11 +272,17 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8f32_00001111: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8f32_00001111: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_00001111: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_00001111: ; AVX512VL-SLOW: # %bb.0: @@ -246,11 +290,17 @@ ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8f32_00001111: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_00001111: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_00001111: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: 
vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -284,11 +334,17 @@ ; AVX512VL-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8f32_08080808: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [34359738368,34359738368,34359738368,34359738368] -; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_08080808: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm2 = [34359738368,34359738368,34359738368,34359738368] +; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_08080808: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -396,14 +452,23 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8f32_08991abb: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8f32_08991abb: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_08991abb: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_08991abb: ; AVX512VL: # %bb.0: @@ -455,12 +520,19 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8f32_09ab1def: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8f32_09ab1def: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_09ab1def: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; 
AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_09ab1def: ; AVX512VL: # %bb.0: @@ -781,15 +853,25 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8f32_c348cda0: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8f32_c348cda0: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] +; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_c348cda0: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_c348cda0: ; AVX512VL: # %bb.0: @@ -821,14 +903,23 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8f32_f511235a: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <7,u,u,u,u,u,u,2> -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8f32_f511235a: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <7,u,u,u,u,u,u,2> +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_f511235a: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_f511235a: ; AVX512VL: # %bb.0: @@ -852,12 +943,18 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8f32_32103210: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] -; AVX2-FAST-NEXT: # 
ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8f32_32103210: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_32103210: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_32103210: ; AVX512VL-SLOW: # %bb.0: @@ -865,12 +962,18 @@ ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8f32_32103210: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] -; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_32103210: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX512VL-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_32103210: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -888,12 +991,18 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8f32_76547654: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8f32_76547654: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_76547654: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_76547654: ; AVX512VL-SLOW: # %bb.0: @@ -901,12 +1010,18 @@ ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8f32_76547654: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] -; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_76547654: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX512VL-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_76547654: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; 
AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -924,11 +1039,17 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8f32_76543210: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8f32_76543210: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_76543210: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_76543210: ; AVX512VL-SLOW: # %bb.0: @@ -936,11 +1057,17 @@ ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8f32_76543210: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_76543210: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_76543210: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -958,11 +1085,17 @@ ; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8f32_3210ba98: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] -; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_3210ba98: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] +; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_3210ba98: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -1002,11 +1135,17 @@ ; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8f32_7654fedc: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] -; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_7654fedc: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] +; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; 
AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_7654fedc: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -1024,12 +1163,18 @@ ; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8f32_fedc7654: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] -; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 -; AVX512VL-FAST-NEXT: vmovaps %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_fedc7654: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] +; AVX512VL-FAST-ALL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 +; AVX512VL-FAST-ALL-NEXT: vmovaps %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_fedc7654: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -1054,11 +1199,17 @@ ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: PR21138: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] -; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: PR21138: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] +; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: PR21138: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %truc, <8 x float> %tchose, <8 x i32> ret <8 x float> %shuffle } @@ -1099,12 +1250,18 @@ ; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8f32_ba983210: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] -; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 -; AVX512VL-FAST-NEXT: vmovaps %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_ba983210: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] +; AVX512VL-FAST-ALL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 +; AVX512VL-FAST-ALL-NEXT: vmovaps %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_ba983210: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -1153,11 +1310,17 @@ ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512VL-SLOW-NEXT: retq ; -; 
AVX512VL-FAST-LABEL: shuffle_v8f32_084c195d: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,8,4,12,1,9,5,13] -; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_084c195d: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,8,4,12,1,9,5,13] +; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_084c195d: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -1219,12 +1382,18 @@ ; AVX512VL-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8f32_0189abcd: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [4,0,1,2] -; AVX512VL-FAST-NEXT: vpermi2pd %ymm0, %ymm1, %ymm2 -; AVX512VL-FAST-NEXT: vmovapd %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_0189abcd: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [4,0,1,2] +; AVX512VL-FAST-ALL-NEXT: vpermi2pd %ymm0, %ymm1, %ymm2 +; AVX512VL-FAST-ALL-NEXT: vmovapd %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_0189abcd: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -1368,11 +1537,17 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8i32_00000010: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8i32_00000010: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00000010: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00000010: ; AVX512VL-SLOW: # %bb.0: @@ -1380,11 +1555,17 @@ ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8i32_00000010: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00000010: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00000010: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd 
{{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1403,11 +1584,17 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8i32_00000200: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8i32_00000200: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00000200: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00000200: ; AVX512VL-SLOW: # %bb.0: @@ -1415,11 +1602,17 @@ ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8i32_00000200: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00000200: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00000200: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1438,11 +1631,17 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8i32_00003000: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8i32_00003000: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00003000: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00003000: ; AVX512VL-SLOW: # %bb.0: @@ -1450,11 +1649,17 @@ ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8i32_00003000: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00003000: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00003000: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd 
{{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1556,11 +1761,17 @@ ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8i32_00112233: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8i32_00112233: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00112233: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00112233: ; AVX512VL-SLOW: # %bb.0: @@ -1568,11 +1779,17 @@ ; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8i32_00112233: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00112233: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00112233: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1591,11 +1808,17 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8i32_00001111: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8i32_00001111: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00001111: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00001111: ; AVX512VL-SLOW: # %bb.0: @@ -1603,11 +1826,17 @@ ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8i32_00001111: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00001111: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00001111: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; 
AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1641,11 +1870,17 @@ ; AVX512VL-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8i32_08080808: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [34359738368,34359738368,34359738368,34359738368] -; AVX512VL-FAST-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_08080808: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [34359738368,34359738368,34359738368,34359738368] +; AVX512VL-FAST-ALL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_08080808: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1759,14 +1994,23 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8i32_08991abb: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8i32_08991abb: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_08991abb: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8i32_08991abb: ; AVX512VL: # %bb.0: @@ -1799,11 +2043,17 @@ ; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8i32_091b2d3f: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15] -; AVX512VL-FAST-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_091b2d3f: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15] +; AVX512VL-FAST-ALL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_091b2d3f: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512VL-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1823,12 +2073,19 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8i32_09ab1def: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8i32_09ab1def: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_09ab1def: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8i32_09ab1def: ; AVX512VL: # %bb.0: @@ -2270,13 +2527,21 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8i32_6caa87e5: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8i32_6caa87e5: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2] +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_6caa87e5: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8i32_6caa87e5: ; AVX512VL: # %bb.0: @@ -2301,12 +2566,18 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8i32_32103210: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8i32_32103210: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_32103210: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: 
shuffle_v8i32_32103210: ; AVX512VL-SLOW: # %bb.0: @@ -2314,12 +2585,18 @@ ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8i32_32103210: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] -; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_32103210: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX512VL-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_32103210: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2337,12 +2614,18 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8i32_76547654: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8i32_76547654: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_76547654: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_76547654: ; AVX512VL-SLOW: # %bb.0: @@ -2350,12 +2633,18 @@ ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8i32_76547654: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] -; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_76547654: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX512VL-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_76547654: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2373,11 +2662,17 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v8i32_76543210: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: shuffle_v8i32_76543210: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX2-FAST-ALL-NEXT: 
vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_76543210: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_76543210: ; AVX512VL-SLOW: # %bb.0: @@ -2385,11 +2680,17 @@ ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8i32_76543210: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_76543210: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_76543210: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2407,11 +2708,17 @@ ; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8i32_3210ba98: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] -; AVX512VL-FAST-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_3210ba98: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] +; AVX512VL-FAST-ALL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_3210ba98: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2451,11 +2758,17 @@ ; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8i32_7654fedc: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] -; AVX512VL-FAST-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_7654fedc: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] +; AVX512VL-FAST-ALL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_7654fedc: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2473,12 +2786,18 @@ ; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8i32_fedc7654: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] -; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, 
%ymm2 -; AVX512VL-FAST-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_fedc7654: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] +; AVX512VL-FAST-ALL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 +; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_fedc7654: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512VL-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2552,12 +2871,18 @@ ; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8i32_089abcde: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,0,1,2,3,4,5,6] -; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512VL-FAST-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_089abcde: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,0,1,2,3,4,5,6] +; AVX512VL-FAST-ALL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 +; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_089abcde: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: valignd {{.*#+}} ymm1 = ymm1[7,0,1,2,3,4,5,6] +; AVX512VL-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2581,12 +2906,18 @@ ; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: shuffle_v8i32_0189abcd: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,0,1,2] -; AVX512VL-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 -; AVX512VL-FAST-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_0189abcd: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,0,1,2] +; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 +; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_0189abcd: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,2] +; AVX512VL-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -4,7 +4,8 @@ ; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW -; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST +; RUN: llc < %s -mcpu=x86-64 
-mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE ; ; Verify that the DAG combiner correctly folds bitwise operations across ; shuffles, nested shuffles with undef, pairs of nested shuffles, and other @@ -2401,13 +2402,20 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: combine_unneeded_subvector1: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: combine_unneeded_subvector1: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: combine_unneeded_subvector1: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: retq %b = add <8 x i32> %a, %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> ret <8 x i32> %c diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefix=VL_BW_DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST-PERLANE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=VL_BW_DQ --check-prefix=VL_BW_DQ-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=VL_BW_DQ --check-prefix=VL_BW_DQ-FAST-PERLANE define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) { ; AVX512F-LABEL: shuf2i1_1_0: @@ -495,30 +497,55 @@ ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: shuf8i1_u_2_u_u_2_u_2_u: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: kmovw %edi, %k1 -; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2] -; AVX512VL-NEXT: 
vpermd %ymm1, %ymm2, %ymm1 -; AVX512VL-NEXT: vpslld $31, %ymm1, %ymm1 -; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: shuf8i1_u_2_u_u_2_u_2_u: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: kmovw %edi, %k1 +; AVX512VL-FAST-ALL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VL-FAST-ALL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} +; AVX512VL-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2] +; AVX512VL-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512VL-FAST-ALL-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX512VL-FAST-ALL-NEXT: vptestmd %ymm1, %ymm1, %k1 +; AVX512VL-FAST-ALL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512VL-FAST-ALL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-FAST-ALL-NEXT: vzeroupper +; AVX512VL-FAST-ALL-NEXT: retq ; -; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u: -; VL_BW_DQ: # %bb.0: -; VL_BW_DQ-NEXT: kmovd %edi, %k0 -; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 -; VL_BW_DQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2] -; VL_BW_DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0 -; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0 -; VL_BW_DQ-NEXT: vzeroupper -; VL_BW_DQ-NEXT: retq +; AVX512VL-FAST-PERLANE-LABEL: shuf8i1_u_2_u_u_2_u_2_u: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: kmovw %edi, %k1 +; AVX512VL-FAST-PERLANE-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} +; AVX512VL-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512VL-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512VL-FAST-PERLANE-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX512VL-FAST-PERLANE-NEXT: vptestmd %ymm1, %ymm1, %k1 +; AVX512VL-FAST-PERLANE-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512VL-FAST-PERLANE-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-FAST-PERLANE-NEXT: vzeroupper +; AVX512VL-FAST-PERLANE-NEXT: retq +; +; VL_BW_DQ-FAST-ALL-LABEL: shuf8i1_u_2_u_u_2_u_2_u: +; VL_BW_DQ-FAST-ALL: # %bb.0: +; VL_BW_DQ-FAST-ALL-NEXT: kmovd %edi, %k0 +; VL_BW_DQ-FAST-ALL-NEXT: vpmovm2d %k0, %ymm0 +; VL_BW_DQ-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2] +; VL_BW_DQ-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; VL_BW_DQ-FAST-ALL-NEXT: vpmovd2m %ymm0, %k0 +; VL_BW_DQ-FAST-ALL-NEXT: vpmovm2w %k0, %xmm0 +; VL_BW_DQ-FAST-ALL-NEXT: vzeroupper +; VL_BW_DQ-FAST-ALL-NEXT: retq +; +; VL_BW_DQ-FAST-PERLANE-LABEL: shuf8i1_u_2_u_u_2_u_2_u: +; VL_BW_DQ-FAST-PERLANE: # %bb.0: +; VL_BW_DQ-FAST-PERLANE-NEXT: kmovd %edi, %k0 +; VL_BW_DQ-FAST-PERLANE-NEXT: vpmovm2d %k0, %ymm0 +; VL_BW_DQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; VL_BW_DQ-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm0 +; VL_BW_DQ-FAST-PERLANE-NEXT: vpmovd2m %ymm0, %k0 +; VL_BW_DQ-FAST-PERLANE-NEXT: vpmovm2w %k0, %xmm0 +; VL_BW_DQ-FAST-PERLANE-NEXT: vzeroupper +; VL_BW_DQ-FAST-PERLANE-NEXT: retq %b = bitcast i8 %a to <8 x i1> %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> ret <8 x i1> %c diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -2,10 +2,13 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s 
--check-prefixes=AVX,AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ ; ; add @@ -37,14 +40,22 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_add_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_add_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_add_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: trunc_add_v4i64_v4i32: ; AVX512: # %bb.0: @@ -472,13 +483,21 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_add_const_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_add_const_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_add_const_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; 
AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: trunc_add_const_v4i64_v4i32: ; AVX512: # %bb.0: @@ -803,14 +822,22 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_sub_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpsubq %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_sub_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_sub_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: trunc_sub_v4i64_v4i32: ; AVX512: # %bb.0: @@ -1206,13 +1233,21 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_sub_const_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_sub_const_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_sub_const_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: trunc_sub_const_v4i64_v4i32: ; AVX512: # %bb.0: @@ -1574,14 +1609,24 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_mul_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_mul_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_mul_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, 
%xmm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-FAST-PERLANE-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: trunc_mul_v4i64_v4i32: ; AVX512F: # %bb.0: @@ -2118,13 +2163,21 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_mul_const_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_mul_const_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_mul_const_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: trunc_mul_const_v4i64_v4i32: ; AVX512: # %bb.0: @@ -2529,14 +2582,22 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_and_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_and_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_and_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: trunc_and_v4i64_v4i32: ; AVX512: # %bb.0: @@ -2885,13 +2946,21 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_and_const_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_and_const_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_and_const_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; 
AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: trunc_and_const_v4i64_v4i32: ; AVX512: # %bb.0: @@ -3214,14 +3283,22 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_xor_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_xor_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_xor_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: trunc_xor_v4i64_v4i32: ; AVX512: # %bb.0: @@ -3570,13 +3647,21 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_xor_const_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_xor_const_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_xor_const_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX512: # %bb.0: @@ -3899,14 +3984,22 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_or_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_or_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_or_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; 
AVX2-FAST-PERLANE-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: trunc_or_v4i64_v4i32: ; AVX512: # %bb.0: @@ -4255,13 +4348,21 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_or_const_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_or_const_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_or_const_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: trunc_or_const_v4i64_v4i32: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -4,11 +4,15 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s 
--check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=SKX ; @@ -502,19 +506,32 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_packus_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_packus_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-FAST-ALL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 +; AVX2-FAST-ALL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_packus_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v4i64_v4i32: ; AVX512F: # %bb.0: @@ -923,25 +940,44 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpand %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpand %ymm0, %ymm2, 
%ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_packus_v8i64_v8i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-FAST-ALL-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 +; AVX2-FAST-ALL-NEXT: vpand %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_packus_v8i64_v8i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpand %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: trunc_packus_v8i64_v8i32: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -4,11 +4,15 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=SKX ; @@ -508,19 +512,32 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_ssat_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] -; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_ssat_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-FAST-ALL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; AVX2-FAST-ALL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_ssat_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] +; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] +; 
AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v4i64_v4i32: ; AVX512F: # %bb.0: @@ -945,25 +962,44 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_ssat_v8i64_v8i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647] -; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_ssat_v8i64_v8i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647] +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_ssat_v8i64_v8i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647] +; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] +; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; 
AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-FAST-PERLANE-NEXT: retq
 ;
 ; AVX512-LABEL: trunc_ssat_v8i64_v8i32:
 ; AVX512: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -1,14 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=SKX ; @@ -349,19 +353,32 @@ ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_usat_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-FAST-NEXT: vpxor %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] -; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729] -; AVX2-FAST-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_usat_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-FAST-ALL-NEXT: vpxor %ymm1, %ymm0, %ymm1 +; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729] +; AVX2-FAST-ALL-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_usat_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-FAST-PERLANE-NEXT: vpxor %ymm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729] +; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v4i64_v4i32: ; AVX512F: # %bb.0: @@ -645,24 +662,42 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc_usat_v8i64_v8i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-FAST-NEXT: vpxor %ymm3, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] -; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm5, 
%ymm4 -; AVX2-FAST-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpxor %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc_usat_v8i64_v8i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm1, %ymm4 +; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-FAST-ALL-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm0, %ymm3 +; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc_usat_v8i64_v8i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: trunc_usat_v8i64_v8i32: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -4,11 +4,15 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) { ; SSE-LABEL: trunc8i64_8i32: @@ -32,13 +36,20 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc8i64_8i32: -; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc8i64_8i32: +; AVX2-FAST-ALL: # %bb.0: # %entry +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32: +; AVX2-FAST-PERLANE: # %bb.0: # %entry +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: trunc8i64_8i32: ; AVX512: # %bb.0: # %entry @@ -71,13 +82,20 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc8i64_8i32_ashr: -; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: 
trunc8i64_8i32_ashr: +; AVX2-FAST-ALL: # %bb.0: # %entry +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_ashr: +; AVX2-FAST-PERLANE: # %bb.0: # %entry +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: trunc8i64_8i32_ashr: ; AVX512: # %bb.0: # %entry @@ -112,13 +130,20 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc8i64_8i32_lshr: -; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_lshr: +; AVX2-FAST-ALL: # %bb.0: # %entry +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_lshr: +; AVX2-FAST-PERLANE: # %bb.0: # %entry +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: trunc8i64_8i32_lshr: ; AVX512: # %bb.0: # %entry @@ -1354,13 +1379,20 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-LABEL: trunc2x4i64_8i32: -; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-FAST-ALL-LABEL: trunc2x4i64_8i32: +; AVX2-FAST-ALL: # %bb.0: # %entry +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: trunc2x4i64_8i32: +; AVX2-FAST-PERLANE: # %bb.0: # %entry +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: trunc2x4i64_8i32: ; AVX512F: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -4,9 +4,11 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW define <8 x i16> @zext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp { ; SSE2-LABEL: zext_16i8_to_8i16: