diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20236,6 +20236,144 @@
   return NewLd;
 }
 
+template <typename R, typename T>
+static auto getFirstIndexOf(R &&Range, const T &Val) {
+  auto I = find(Range, Val);
+  if (I == Range.end())
+    return -1L;
+  return std::distance(Range.begin(), I);
+}
+
+/// Given EXTRACT_SUBVECTOR(VECTOR_SHUFFLE(Op0, Op1, Mask)),
+/// try to produce VECTOR_SHUFFLE(EXTRACT_SUBVECTOR(Op?, ?),
+///                               EXTRACT_SUBVECTOR(Op?, ?),
+///                               Mask')
+/// iff it is legal and profitable to do so. Notably, the trimmed mask
+/// (containing only the elements that are extracted)
+/// must reference at most two subvectors.
+static SDValue foldExtractSubvectorFromShuffleVector(SDNode *N,
+                                                     SelectionDAG &DAG,
+                                                     const TargetLowering &TLI,
+                                                     bool LegalOperations) {
+  assert(N->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+         "Must only be called on EXTRACT_SUBVECTOR's");
+
+  SDValue N0 = N->getOperand(0);
+
+  // Only deal with non-scalable vectors.
+  EVT NarrowVT = N->getValueType(0);
+  EVT WideVT = N0.getValueType();
+  if (!NarrowVT.isFixedLengthVector() || !WideVT.isFixedLengthVector())
+    return SDValue();
+
+  // The operand must be a shufflevector.
+  auto *WideShuffleVector = dyn_cast<ShuffleVectorSDNode>(N0);
+  if (!WideShuffleVector)
+    return SDValue();
+
+  uint64_t FirstExtractedEltIdx = N->getConstantOperandVal(1);
+  int NumEltsExtracted = NarrowVT.getVectorNumElements();
+  assert((FirstExtractedEltIdx % NumEltsExtracted) == 0 &&
+         "Extract index is not a multiple of the output vector length.");
+
+  int WideNumElts = WideVT.getVectorNumElements();
+
+  SmallVector<int> NewMask;
+  NewMask.reserve(NumEltsExtracted);
+  SmallSetVector<std::pair<SDValue, int>, 2> DemandedSubvectors;
+
+  // Try to decode the wide mask into a narrow mask from at most two
+  // subvectors.
+  for (int M : WideShuffleVector->getMask()
+                   .drop_front(FirstExtractedEltIdx)
+                   .take_front(NumEltsExtracted)) {
+    if (M < 0) {
+      assert(M == -1 && "Unexpected target shuffle mask?");
+      // Does not depend on operands, does not require adjustment.
+      NewMask.emplace_back(M);
+      continue;
+    }
+
+    assert(M < (2 * WideNumElts) && "Out-of-bounds shuffle mask?");
+
+    // From which operand of the shuffle does this shuffle mask element pick?
+    int WideShufOpIdx = M / WideNumElts;
+    // Which element of that operand is picked?
+    int OpEltIdx = M % WideNumElts;
+    // And which NumEltsExtracted-sized subvector of that operand is that?
+    int OpSubvecIdx = OpEltIdx / NumEltsExtracted;
+    // And which element within that subvector of that operand is that?
+    int OpEltIdxInSubvec = OpEltIdx % NumEltsExtracted;
+
+    SDValue Op = WideShuffleVector->getOperand(WideShufOpIdx);
+
+    if (Op.isUndef()) {
+      // Picking from an undef operand. Let's adjust mask instead.
+      NewMask.emplace_back(-1);
+      continue;
+    }
+
+    const std::pair<SDValue, int> DemandedSubvector(Op, OpSubvecIdx);
+
+    DemandedSubvectors.insert(DemandedSubvector);
+    if (DemandedSubvectors.size() > 2)
+      return SDValue(); // We can't handle more than two subvectors.
+
+    // Ok, but from which operand of the new shuffle will this element pick?
+    int NewOpIdx =
+        getFirstIndexOf(DemandedSubvectors.getArrayRef(), DemandedSubvector);
+    assert(NewOpIdx >= 0 && NewOpIdx < 2 && "Unexpected operand index.");
+
+    int AdjM = OpEltIdxInSubvec + NewOpIdx * NumEltsExtracted;
+    NewMask.emplace_back(AdjM);
+  }
+  assert(NewMask.size() == (unsigned)NumEltsExtracted && "Produced bad mask.");
+  assert(DemandedSubvectors.size() <= 2 &&
+         "Should have ended up demanding at most two subvectors.");
+
+  // Did we discover that the shuffle does not actually depend on operands?
+  if (DemandedSubvectors.empty())
+    return DAG.getUNDEF(NarrowVT);
+
+  // Ok, looks like we will end up forming a new shuffle after all,
+  // which means that the old one needs to go away.
+  if (!WideShuffleVector->hasOneUse())
+    return SDValue();
+
+  // And the narrow shufflevector that we'll form must be legal.
+  if (LegalOperations &&
+      !TLI.isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, NarrowVT))
+    return SDValue();
+
+  // We still perform the exact same EXTRACT_SUBVECTOR, just on different
+  // operand[s]/index[es], so there is no point in checking its legality.
+
+  // Bail out if the target cannot handle the shuffle we want to create.
+  if (!TLI.isShuffleMaskLegal(NewMask, NarrowVT))
+    return SDValue();
+
+  SDLoc DL(N);
+
+  SmallVector<SDValue, 2> NewOps;
+  for (const std::pair<SDValue, int> &DemandedSubvector :
+       DemandedSubvectors) {
+    // How many elements into the WideVT does this subvector start?
+    int Index = NumEltsExtracted * DemandedSubvector.second;
+    // Bail out if said extraction does not fit our budget.
+    if (!TLI.isExtractSubvectorCheap(NarrowVT, WideVT, Index))
+      return SDValue();
+    SDValue IndexC = DAG.getVectorIdxConstant(Index, DL);
+    NewOps.emplace_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowVT,
+                                    DemandedSubvector.first, IndexC));
+  }
+  assert(NewOps.size() <= 2 && "Should end up with at most two ops");
+
+  if (NewOps.size() == 1)
+    NewOps.emplace_back(DAG.getUNDEF(NarrowVT));
+
+  return DAG.getVectorShuffle(NarrowVT, DL, NewOps[0], NewOps[1], NewMask);
+}
+
 SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
   EVT NVT = N->getValueType(0);
   SDValue V = N->getOperand(0);
@@ -20348,6 +20486,10 @@
     }
   }
 
+  if (SDValue V =
+          foldExtractSubvectorFromShuffleVector(N, DAG, TLI, LegalOperations))
+    return V;
+
   V = peekThroughBitcasts(V);
 
   // If the input is a build vector. Try to make a smaller build vector.
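For illustration, here is a minimal LLVM IR sketch of the pattern this combine rewrites (hypothetical, not part of the patch; the function and value names are invented to mirror the tests below). Extracting the high <4 x i32> half of a wide 8-element shuffle leaves the trimmed wide mask <2,14,3,15>, which demands only subvector #0 of %a and subvector #1 of %b, so assuming a target where the narrow extracts are cheap, the whole thing can become a narrow shuffle of two narrow extracts with mask <2,6,3,7>:

; Hypothetical sketch, not taken from the tests in this patch.
define <4 x i32> @extract_high_of_wide_shuffle(<8 x i32> %a, <8 x i32> %b) {
  ; Wide 8-element shuffle; only its high half is demanded below.
  %wide = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 14, i32 3, i32 15>
  ; This extract becomes EXTRACT_SUBVECTOR(%wide, 4) in the SelectionDAG.
  %hi = shufflevector <8 x i32> %wide, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %hi
}

Once the fold fires, the wide shuffle has no remaining uses and is deleted, which is what drives the CHECK-line churn in the test updates that follow.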
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -282,7 +282,7 @@
 define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) {
 ; CHECK-LABEL: ins2f1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: dup v0.2d, v0.d[1]
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: ret
 %tmp3 = extractelement <2 x double> %tmp1, i32 1
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
@@ -15,6 +15,8 @@
 ; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
 ; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
 
+; REQUIRES: please-fix-autoupdate-script
+
 target triple = "aarch64-unknown-linux-gnu"
 
 ; Don't use SVE when its registers are no bigger than NEON.
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
@@ -15,6 +15,8 @@
 ; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
 ; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
 
+; REQUIRES: please-fix-autoupdate-script
+
 target triple = "aarch64-unknown-linux-gnu"
 
 ; Don't use SVE when its registers are no bigger than NEON.
diff --git a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll
--- a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll
+++ b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=armv7 | FileCheck %s
 ;
 ; Ensure that we don't crash given a largeish power-of-two shufflevector index.
@@ -9,8 +10,8 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: mov r1, #32
 ; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
-; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
-; CHECK-NEXT: vtrn.32 q8, q9
+; CHECK-NEXT: vldr d18, [r0]
+; CHECK-NEXT: vtrn.32 d16, d18
 ; CHECK-NEXT: vadd.i32 d16, d16, d16
 ; CHECK-NEXT: vmov.32 r0, d16[1]
 ; CHECK-NEXT: bx lr
diff --git a/llvm/test/CodeGen/ARM/fp16-insert-extract.ll b/llvm/test/CodeGen/ARM/fp16-insert-extract.ll
--- a/llvm/test/CodeGen/ARM/fp16-insert-extract.ll
+++ b/llvm/test/CodeGen/ARM/fp16-insert-extract.ll
@@ -176,7 +176,7 @@
 ; CHECKHARD-NEXT: vmov r0, s12
 ; CHECKHARD-NEXT: vext.16 d16, d4, d5, #2
 ; CHECKHARD-NEXT: vmovx.f16 s12, s4
-; CHECKHARD-NEXT: vdup.16 q11, d3[1]
+; CHECKHARD-NEXT: vdup.32 d21, d3[1]
 ; CHECKHARD-NEXT: vrev32.16 d17, d16
 ; CHECKHARD-NEXT: vext.16 d16, d16, d17, #3
 ; CHECKHARD-NEXT: vrev32.16 d17, d3
@@ -207,8 +207,8 @@
 ; CHECKHARD-NEXT: vmov r0, s11
 ; CHECKHARD-NEXT: vmov.16 d20[3], r0
 ; CHECKHARD-NEXT: vmov r0, s10
-; CHECKHARD-NEXT: vext.16 d20, d20, d22, #1
-; CHECKHARD-NEXT: vdup.16 q11, d3[2]
+; CHECKHARD-NEXT: vext.16 d20, d20, d21, #1
+; CHECKHARD-NEXT: vdup.32 d21, d3[2]
 ; CHECKHARD-NEXT: vext.16 d19, d20, d20, #3
 ; CHECKHARD-NEXT: vadd.f16 q8, q8, q9
 ; CHECKHARD-NEXT: vext.16 d18, d0, d1, #2
@@ -223,7 +223,7 @@
 ; CHECKHARD-NEXT: vmov.16 d20[2], r0
 ; CHECKHARD-NEXT: vmov r0, s0
 ; CHECKHARD-NEXT: vmov.16 d20[3], r0
-; CHECKHARD-NEXT: vext.16 d20, d20, d22, #1
+; CHECKHARD-NEXT: vext.16 d20, d20, d21, #1
 ; CHECKHARD-NEXT: vext.16 d19, d20, d20, #3
 ; CHECKHARD-NEXT: vadd.f16 q0, q8, q9
 ; CHECKHARD-NEXT: bx lr
@@ -235,7 +235,7 @@
 ; CHECKSOFT-NEXT: vmov r0, s12
 ; CHECKSOFT-NEXT: vext.16 d16, d4, d5, #2
 ; CHECKSOFT-NEXT: vmovx.f16 s12, s4
-; CHECKSOFT-NEXT: vdup.16 q11, d3[1]
+; CHECKSOFT-NEXT: vdup.32 d21, d3[1]
 ; CHECKSOFT-NEXT: vrev32.16 d17, d16
 ; CHECKSOFT-NEXT: vext.16 d16, d16, d17, #3
 ; CHECKSOFT-NEXT: vrev32.16 d17, d3
@@ -266,8 +266,8 @@
 ; CHECKSOFT-NEXT: vmov r0, s11
 ; CHECKSOFT-NEXT: vmov.16 d20[3], r0
 ; CHECKSOFT-NEXT: vmov r0, s10
-; CHECKSOFT-NEXT: vext.16 d20, d20, d22, #1
-; CHECKSOFT-NEXT: vdup.16 q11, d3[2]
+; CHECKSOFT-NEXT: vext.16 d20, d20, d21, #1
+; CHECKSOFT-NEXT: vdup.32 d21, d3[2]
 ; CHECKSOFT-NEXT: vext.16 d19, d20, d20, #3
 ; CHECKSOFT-NEXT: vadd.f16 q8, q8, q9
 ; CHECKSOFT-NEXT: vext.16 d18, d0, d1, #2
@@ -282,7 +282,7 @@
 ; CHECKSOFT-NEXT: vmov.16 d20[2], r0
 ; CHECKSOFT-NEXT: vmov r0, s0
 ; CHECKSOFT-NEXT: vmov.16 d20[3], r0
-; CHECKSOFT-NEXT: vext.16 d20, d20, d22, #1
+; CHECKSOFT-NEXT: vext.16 d20, d20, d21, #1
 ; CHECKSOFT-NEXT: vext.16 d19, d20, d20, #3
 ; CHECKSOFT-NEXT: vadd.f16 q0, q8, q9
 ; CHECKSOFT-NEXT: bx lr
diff --git a/llvm/test/CodeGen/ARM/vext.ll b/llvm/test/CodeGen/ARM/vext.ll
--- a/llvm/test/CodeGen/ARM/vext.ll
+++ b/llvm/test/CodeGen/ARM/vext.ll
@@ -218,13 +218,13 @@
 ; CHECK: @ %bb.0:
 ; CHECK-NEXT: vldr d18, [r0, #32]
 ; CHECK-NEXT: mov r1, r0
-; CHECK-NEXT: vorr d22, d18, d18
+; CHECK-NEXT: vorr d21, d18, d18
 ; CHECK-NEXT: vld1.16 {d16, d17}, [r1:128]!
 ; CHECK-NEXT: vldr d19, [r0, #48]
-; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128]
-; CHECK-NEXT: vzip.16 d22, d19
-; CHECK-NEXT: vtrn.16 q8, q10
-; CHECK-NEXT: vext.16 d18, d18, d22, #2
+; CHECK-NEXT: vldr d20, [r1]
+; CHECK-NEXT: vzip.16 d21, d19
+; CHECK-NEXT: vtrn.16 d16, d20
+; CHECK-NEXT: vext.16 d18, d18, d21, #2
 ; CHECK-NEXT: vext.16 d16, d18, d16, #2
 ; CHECK-NEXT: vext.16 d16, d16, d16, #2
 ; CHECK-NEXT: vmov r0, r1, d16
diff --git a/llvm/test/CodeGen/X86/avx2-conversions.ll b/llvm/test/CodeGen/X86/avx2-conversions.ll
--- a/llvm/test/CodeGen/X86/avx2-conversions.ll
+++ b/llvm/test/CodeGen/X86/avx2-conversions.ll
@@ -1,55 +1,25 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-SLOW
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X32,X32-FAST-ALL
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X32,X32-FAST-PERLANE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST-PERLANE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X32
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X32
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X64
 
 define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
-; X32-SLOW-LABEL: trunc4:
-; X32-SLOW: # %bb.0:
-; X32-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X32-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; X32-SLOW-NEXT: vzeroupper
-; X32-SLOW-NEXT: retl
+; X32-LABEL: trunc4:
+; X32: # %bb.0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
 ;
-; X32-FAST-ALL-LABEL: trunc4:
-; X32-FAST-ALL: # %bb.0:
-; X32-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; X32-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; X32-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; X32-FAST-ALL-NEXT: vzeroupper
-; X32-FAST-ALL-NEXT: retl
-;
-; X32-FAST-PERLANE-LABEL: trunc4:
-; X32-FAST-PERLANE: # %bb.0:
-; X32-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X32-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; X32-FAST-PERLANE-NEXT: vzeroupper
-; X32-FAST-PERLANE-NEXT: retl
-;
-; X64-SLOW-LABEL: trunc4:
-; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; X64-SLOW-NEXT: vzeroupper
-; X64-SLOW-NEXT: retq
-;
-; X64-FAST-ALL-LABEL: trunc4:
-; X64-FAST-ALL: # %bb.0:
-; X64-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; X64-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; X64-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; X64-FAST-ALL-NEXT: vzeroupper
-; X64-FAST-ALL-NEXT: retq
-;
-; X64-FAST-PERLANE-LABEL: trunc4:
-; X64-FAST-PERLANE: # %bb.0:
-; X64-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; X64-FAST-PERLANE-NEXT: vzeroupper
-; X64-FAST-PERLANE-NEXT: retq
+; X64-LABEL: trunc4:
+; X64: # %bb.0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
 %B = trunc <4 x i64> %A to <4 x i32>
 ret <4 x i32>%B
 }
@@ -57,17 +27,17 @@
 define <8 x i16> @trunc8(<8 x i32> %A) nounwind {
 ; X32-LABEL: trunc8:
 ; X32: # %bb.0:
-; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X32-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; X32-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; X32-NEXT: vzeroupper
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: trunc8:
 ; X64: # %bb.0:
-; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
 %B = trunc <8 x i32> %A to <8 x i16>
diff --git a/llvm/test/CodeGen/X86/avx2-shift.ll b/llvm/test/CodeGen/X86/avx2-shift.ll
--- a/llvm/test/CodeGen/X86/avx2-shift.ll
+++ b/llvm/test/CodeGen/X86/avx2-shift.ll
@@ -530,9 +530,9 @@
 ; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X32-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X32-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; X32-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; X32-NEXT: vzeroupper
 ; X32-NEXT: retl
 ;
@@ -541,9 +541,9 @@
 ; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
 %res = shl <8 x i16> %lhs, %rhs
diff --git a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
--- a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86,X86-SLOW
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X86,X86-FAST-ALL
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X86,X86-FAST-PERLANE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST-PERLANE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X64
 
 ; AVX2 Logical Shift Left
 
@@ -376,65 +376,25 @@
 }
 
 define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
-; X86-SLOW-LABEL: srl_trunc_and_v4i64:
-; X86-SLOW: # %bb.0:
-; X86-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X86-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; X86-SLOW-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
-; X86-SLOW-NEXT: vandps %xmm2, %xmm1, %xmm1
-; X86-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; X86-SLOW-NEXT: vzeroupper
-; X86-SLOW-NEXT: retl
+; X86-LABEL: srl_trunc_and_v4i64:
+; X86: # %bb.0:
+; X86-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
+; X86-NEXT: vandps %xmm2, %xmm1, %xmm1
+; X86-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
 ;
-; X86-FAST-ALL-LABEL: srl_trunc_and_v4i64:
-; X86-FAST-ALL: # %bb.0:
-; X86-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; X86-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; X86-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
-; X86-FAST-ALL-NEXT: vpand %xmm2, %xmm1, %xmm1
-; X86-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; X86-FAST-ALL-NEXT: vzeroupper
-; X86-FAST-ALL-NEXT: retl
-;
-; X86-FAST-PERLANE-LABEL: srl_trunc_and_v4i64:
-; X86-FAST-PERLANE: # %bb.0:
-; X86-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X86-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; X86-FAST-PERLANE-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
-; X86-FAST-PERLANE-NEXT: vandps %xmm2, %xmm1, %xmm1
-; X86-FAST-PERLANE-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; X86-FAST-PERLANE-NEXT: vzeroupper
-; X86-FAST-PERLANE-NEXT: retl
-;
-; X64-SLOW-LABEL: srl_trunc_and_v4i64:
-; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; X64-SLOW-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
-; X64-SLOW-NEXT: vandps %xmm2, %xmm1, %xmm1
-; X64-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; X64-SLOW-NEXT: vzeroupper
-; X64-SLOW-NEXT: retq
-;
-; X64-FAST-ALL-LABEL: srl_trunc_and_v4i64:
-; X64-FAST-ALL: # %bb.0:
-; X64-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; X64-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; X64-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
-; X64-FAST-ALL-NEXT: vpand %xmm2, %xmm1, %xmm1
-; X64-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; X64-FAST-ALL-NEXT: vzeroupper
-; X64-FAST-ALL-NEXT: retq
-;
-; X64-FAST-PERLANE-LABEL: srl_trunc_and_v4i64:
-; X64-FAST-PERLANE: # %bb.0:
-; X64-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; X64-FAST-PERLANE-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
-; X64-FAST-PERLANE-NEXT: vandps %xmm2, %xmm1, %xmm1
-; X64-FAST-PERLANE-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; X64-FAST-PERLANE-NEXT: vzeroupper
-; X64-FAST-PERLANE-NEXT: retq
+; X64-LABEL: srl_trunc_and_v4i64:
+; X64: # %bb.0:
+; X64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
+; X64-NEXT: vandps %xmm2, %xmm1, %xmm1
+; X64-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
 %and = and <4 x i64> %y, 
 %trunc = trunc <4 x i64> %and to <4 x i32>
 %sra = lshr <4 x i32> %x, %trunc
@@ -451,9 +411,9 @@
 ; X86-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X86-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; X86-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; X86-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X86-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; X86-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; X86-NEXT: vzeroupper
 ; X86-NEXT: retl
 ;
@@ -462,9 +422,9 @@
 ; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
 %shl = shl <8 x i16> %r, %a
diff --git a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
--- a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
+++ b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
@@ -159,11 +159,9 @@
 define <4 x double> @fadd_noundef_high(<8 x double> %x225, <8 x double> %x227) {
 ; CHECK-LABEL: fadd_noundef_high:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm1
 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm1
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT: retq
 %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> 
 %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> 
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -1416,25 +1416,46 @@
 ret <4 x i32> %res
 }
 define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) {
-; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,0,0,13]
-; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_16xi32_to_4xi32_perm_mask3:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [3,0,0,13]
+; CHECK-FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_16xi32_to_4xi32_perm_mask3:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; CHECK-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2
+; CHECK-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [3,0,0,5]
+; CHECK-FAST-PERLANE-NEXT: vpermi2d %xmm2, %xmm0, %xmm1
+; CHECK-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-FAST-PERLANE-NEXT: vzeroupper
+; CHECK-FAST-PERLANE-NEXT: retq
 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> 
 ret <4 x i32> %res
 }
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
-; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,0,13]
-; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
-; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_16xi32_to_4xi32_perm_mask3:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,0,13]
+; CHECK-FAST-NEXT: vpermd %zmm0, %zmm3, %zmm0
+; CHECK-FAST-NEXT: vptestnmd %xmm2, %xmm2, %k1
+; CHECK-FAST-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_16xi32_to_4xi32_perm_mask3:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [3,0,0,5]
+; CHECK-FAST-PERLANE-NEXT: vpermi2d %xmm3, %xmm0, %xmm4
+; CHECK-FAST-PERLANE-NEXT: vptestnmd %xmm2, %xmm2, %k1
+; CHECK-FAST-PERLANE-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT: vzeroupper
+; CHECK-FAST-PERLANE-NEXT: retq
 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> 
 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
@@ -1442,14 +1463,25 @@
 }
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,0,13]
-; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,0,13]
+; CHECK-FAST-NEXT: vptestnmd %xmm1, %xmm1, %k1
+; CHECK-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3
+; CHECK-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,0,5]
+; CHECK-FAST-PERLANE-NEXT: vptestnmd %xmm1, %xmm1, %k1
+; CHECK-FAST-PERLANE-NEXT: vpermi2d %xmm3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-FAST-PERLANE-NEXT: vzeroupper
+; CHECK-FAST-PERLANE-NEXT: retq
 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> 
 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
@@ -2737,12 +2769,9 @@
 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT: vpunpcklqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
-; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
+; CHECK-NEXT: vmovdqa 48(%rdi), %xmm2
 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vpunpcklqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
 ; CHECK-NEXT: retq
 %vec = load <8 x i64>, <8 x i64>* %vp
 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> 
@@ -2754,12 +2783,9 @@
 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
-; CHECK-NEXT: vpunpcklqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
-; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
+; CHECK-NEXT: vmovdqa 48(%rdi), %xmm1
 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vpunpcklqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
 ; CHECK-NEXT: retq
 %vec = load <8 x i64>, <8 x i64>* %vp
 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> 
@@ -3207,26 +3233,47 @@
 ret <8 x float> %res
 }
 define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) {
-; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,8,9,10]
-; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_16xfloat_to_4xfloat_perm_mask0:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [4,8,9,10]
+; CHECK-FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_16xfloat_to_4xfloat_perm_mask0:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm1
+; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = [4,0,1,2]
+; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm2, %xmm1, %xmm0
+; CHECK-FAST-PERLANE-NEXT: vzeroupper
+; CHECK-FAST-PERLANE-NEXT: retq
 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> 
 ret <4 x float> %res
 }
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
-; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [4,8,9,10]
-; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
-; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm3 = [4,8,9,10]
+; CHECK-FAST-NEXT: vpermps %zmm0, %zmm3, %zmm0
+; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-FAST-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm3
+; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = [4,0,1,2]
+; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm0, %xmm3, %xmm4
+; CHECK-FAST-PERLANE-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm0, %xmm2, %k1
+; CHECK-FAST-PERLANE-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT: vzeroupper
+; CHECK-FAST-PERLANE-NEXT: retq
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> 
 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
@@ -3234,15 +3281,26 @@
 }
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) {
-; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [4,8,9,10]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [4,8,9,10]
+; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-FAST-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm2
+; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = [4,0,1,2]
+; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm4, %xmm1, %k1
+; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT: vzeroupper
+; CHECK-FAST-PERLANE-NEXT: retq
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> 
 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
@@ -3305,11 +3363,12 @@
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[0,1],ymm3[4,4],ymm0[4,5]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
-; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1}
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,0],xmm3[0,1]
 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
@@ -3322,11 +3381,12 @@
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5]
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0],xmm2[0,1]
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> 
@@ -3643,14 +3703,13 @@
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [60129542148,60129542148]
-; CHECK-NEXT: # xmm2 = mem[0,0]
-; CHECK-NEXT: vmovaps 32(%rdi), %ymm3
-; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm3
+; CHECK-NEXT: vmovaps 48(%rdi), %xmm2
+; CHECK-NEXT: vmovddup {{.*#+}} xmm3 = [25769803776,25769803776]
+; CHECK-NEXT: # xmm3 = mem[0,0]
+; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
-; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, <16 x float>* %vp
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> 
@@ -3662,14 +3721,13 @@
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [60129542148,60129542148]
-; CHECK-NEXT: # xmm2 = mem[0,0]
-; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
+; CHECK-NEXT: vmovaps 48(%rdi), %xmm2
+; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = [25769803776,25769803776]
+; CHECK-NEXT: # xmm1 = mem[0,0]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, <16 x float>* %vp
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> 
@@ -4238,11 +4296,13 @@
 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} xmm3 = [3,7]
-; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm3
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],xmm3[1]
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> 
@@ -4254,11 +4314,12 @@
 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} xmm2 = [3,7]
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm2[1]
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> 
diff --git a/llvm/test/CodeGen/X86/cast-vsel.ll b/llvm/test/CodeGen/X86/cast-vsel.ll
--- a/llvm/test/CodeGen/X86/cast-vsel.ll
+++ b/llvm/test/CodeGen/X86/cast-vsel.ll
@@ -212,9 +212,11 @@
 ; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
 ; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
 ; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll
--- a/llvm/test/CodeGen/X86/combine-shl.ll
+++ b/llvm/test/CodeGen/X86/combine-shl.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX
 
 ; fold (shl 0, x) -> 0
 define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
@@ -137,32 +137,14 @@
 ; SSE41-NEXT: pmulld %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-SLOW-LABEL: combine_vec_shl_trunc_and:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-SLOW-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX-SLOW-NEXT: vzeroupper
-; AVX-SLOW-NEXT: retq
-;
-; AVX-FAST-ALL-LABEL: combine_vec_shl_trunc_and:
-; AVX-FAST-ALL: # %bb.0:
-; AVX-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; AVX-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-FAST-ALL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX-FAST-ALL-NEXT: vzeroupper
-; AVX-FAST-ALL-NEXT: retq
-;
-; AVX-FAST-PERLANE-LABEL: combine_vec_shl_trunc_and:
-; AVX-FAST-PERLANE: # %bb.0:
-; AVX-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-FAST-PERLANE-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX-FAST-PERLANE-NEXT: vzeroupper
-; AVX-FAST-PERLANE-NEXT: retq
+; AVX-LABEL: combine_vec_shl_trunc_and:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
 %1 = and <4 x i64> %y, 
 %2 = trunc <4 x i64> %1 to <4 x i32>
 %3 = shl <4 x i32> %x, %2
diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX
 
 ; fold (sra 0, x) -> 0
 define <4 x i32> @combine_vec_ashr_zero(<4 x i32> %x) {
@@ -167,32 +167,14 @@
 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
 ; SSE-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_and:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_and:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-FAST-ALL-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_and:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX-LABEL: combine_vec_ashr_trunc_and:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
 %1 = and <4 x i64> %y, 
 %2 = trunc <4 x i64> %1 to <4 x i32>
 %3 = ashr <4 x i32> %x, %2
@@ -215,29 +197,13 @@
 ; SSE-NEXT: movdqa %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-SLOW-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_lshr:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_lshr:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-FAST-PERLANE-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX-LABEL: combine_vec_ashr_trunc_lshr:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
 %1 = lshr <4 x i64> %x, 
 %2 = trunc <4 x i64> %1 to <4 x i32>
 %3 = ashr <4 x i32> %2, 
@@ -288,29 +254,13 @@
 ; SSE-NEXT: movdqa %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-SLOW-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_ashr:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_ashr:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-FAST-PERLANE-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX-LABEL: combine_vec_ashr_trunc_ashr:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
 %1 = ashr <4 x i64> %x, 
 %2 = trunc <4 x i64> %1 to <4 x i32>
 %3 = ashr <4 x i32> %2, 
diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX
 
 ; fold (srl 0, x) -> 0
 define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) {
@@ -202,32 +202,14 @@
 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
 ; SSE-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr1:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_lshr1:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: combine_vec_lshr_trunc_lshr1:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-FAST-PERLANE-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX-LABEL: combine_vec_lshr_trunc_lshr1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
 %1 = lshr <4 x i64> %x, 
 %2 = trunc <4 x i64> %1 to <4 x i32>
 %3 = lshr <4 x i32> %2, 
@@ -424,32 +406,14 @@
 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
 ; SSE-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_and:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_and:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: combine_vec_lshr_trunc_and:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX-LABEL: combine_vec_lshr_trunc_and:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
 %1 = and <4 x i64> %y, 
 %2 = trunc <4 x i64> %1 to <4 x i32>
 %3 = lshr <4 x i32> %x, %2
diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll
--- a/llvm/test/CodeGen/X86/known-signbits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll
@@ -134,8 +134,9 @@
 ; X64-AVX2-LABEL: signbits_ashr_sitofp_1:
 ; X64-AVX2: # %bb.0:
 ; X64-AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; X64-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; X64-AVX2-NEXT: vcvtdq2ps %xmm0, %xmm0
 ; X64-AVX2-NEXT: vzeroupper
 ; X64-AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -3539,8 +3539,9 @@
 ; AVX2-LABEL: truncstore_v8i32_v8i16:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vmovmskps %ymm1, %eax
 ; AVX2-NEXT: notl %eax
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1690,7 +1690,6 @@
 define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-legal-vector-width"="256" {
 ; CHECK-AVX512-LABEL: splatvar_rotate_v32i8:
 ; CHECK-AVX512: # %bb.0:
-; CHECK-AVX512-NEXT: vpbroadcastb %xmm1, %xmm1
 ; CHECK-AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; CHECK-AVX512-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; CHECK-AVX512-NEXT: vpsllw %xmm2, %ymm0, %ymm3
@@ -1710,7 +1709,6 @@
 ;
 ; CHECK-VBMI-LABEL: splatvar_rotate_v32i8:
 ; CHECK-VBMI: # %bb.0:
-; CHECK-VBMI-NEXT: vpbroadcastb %xmm1, %xmm1
 ; CHECK-VBMI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; CHECK-VBMI-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; CHECK-VBMI-NEXT: vpsllw %xmm2, %ymm0, %ymm3
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -3,9 +3,9 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE2OR3,SSSE3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
 
@@ -934,8 +934,9 @@
 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
 ; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
 ; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT: vzeroupper
@@ -2575,60 +2576,23 @@
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: test33:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm2, %ymm4
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
-; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6]
-; AVX2-SLOW-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: test33:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm1, %ymm4
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm4, %ymm1, %ymm6, %ymm1
-; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm1
-; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm2, %ymm3
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm2, %ymm6, %ymm2
-; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm2
-; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-FAST-ALL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: test33:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm2, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
-;
AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6] -; AVX2-FAST-PERLANE-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: retq +; AVX2-LABEL: test33: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3] +; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6] +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test33: ; AVX512: # %bb.0: @@ -2811,66 +2775,25 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-SLOW-LABEL: test34: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] -; AVX2-SLOW-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm2, %ymm4 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 -; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6] -; AVX2-SLOW-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-ALL-LABEL: test34: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] -; AVX2-FAST-ALL-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm1, %ymm4 -; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] -; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-FAST-ALL-NEXT: vblendvpd %ymm4, %ymm1, %ymm6, %ymm1 -; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-ALL-NEXT: vpermps 
%ymm1, %ymm4, %ymm1 -; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm2, %ymm3 -; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm2, %ymm6, %ymm2 -; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm2 -; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-ALL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: test34: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] -; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6] -; AVX2-FAST-PERLANE-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: retq +; AVX2-LABEL: test34: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3] +; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6] +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test34: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/reduce-trunc-shl.ll b/llvm/test/CodeGen/X86/reduce-trunc-shl.ll --- a/llvm/test/CodeGen/X86/reduce-trunc-shl.ll +++ b/llvm/test/CodeGen/X86/reduce-trunc-shl.ll @@ -38,8 +38,9 @@ ; ; AVX2-LABEL: trunc_shl_15_v8i16_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff 
--git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512VL @@ -202,8 +202,9 @@ ; AVX2-LABEL: trunc_v8i32_to_v8i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vmovdqa %xmm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -300,34 +301,12 @@ } define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind { -; AVX1-LABEL: trunc_v4i64_to_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX1-NEXT: vmovaps %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi) -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i32: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-ALL-NEXT: vpermps (%rdi), %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovaps %xmm0, (%rsi) -; AVX2-FAST-ALL-NEXT: vzeroupper -; AVX2-FAST-ALL-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: trunc_v4i64_to_v4i32: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, (%rsi) -; AVX2-FAST-PERLANE-NEXT: retq +; AVX-LABEL: trunc_v4i64_to_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] +; AVX-NEXT: vmovaps %xmm0, (%rsi) +; 
AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i64_to_v4i32: ; AVX512F: # %bb.0: @@ -559,8 +538,9 @@ ; ; AVX2-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -617,8 +597,9 @@ ; ; AVX2-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -786,37 +767,13 @@ } define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) nounwind { -; AVX1-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-ALL-NEXT: vzeroupper -; AVX2-FAST-ALL-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq +; AVX-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: ; AVX512F: # %bb.0: @@ -857,37 +814,13 @@ } define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) nounwind { -; AVX1-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: -; 
AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-ALL-NEXT: vzeroupper -; AVX2-FAST-ALL-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq +; AVX-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/trunc-subvector.ll b/llvm/test/CodeGen/X86/trunc-subvector.ll --- a/llvm/test/CodeGen/X86/trunc-subvector.ll +++ b/llvm/test/CodeGen/X86/trunc-subvector.ll @@ -79,9 +79,11 @@ ; ; AVX2-LABEL: test5: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [3,4,4,4] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -177,9 +179,11 @@ ; ; AVX2-LABEL: test10: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [3,4,4,4] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -575,8 +575,9 @@ ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -1466,14 +1466,12 @@ ; AVX1-NEXT: vmovd %ecx, %xmm3 ; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31] ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32] ; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm4 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm4[0],zero,xmm4[1],zero ; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -378,8 +378,9 @@ ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -663,8 +663,8 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -672,8 +672,8 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm2, %xmm2 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -686,7 +686,6 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v8i32: ; AVX1: # %bb.0: -; 
AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero @@ -704,7 +703,6 @@ ; ; AVX2-LABEL: splatvar_funnnel_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero @@ -760,8 +758,8 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v8i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -769,8 +767,8 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v8i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm2, %xmm2 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -783,8 +781,6 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero @@ -802,7 +798,6 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm2 @@ -815,7 +810,6 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm2 @@ -828,7 +822,6 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm2 @@ -841,7 +834,6 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm2 @@ -854,7 +846,6 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm2 @@ -881,9 +872,9 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm1[0,0,0,0] -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -891,8 +882,8 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm2, %xmm2 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -905,35 +896,33 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsllw %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpsllw %xmm3, %xmm6, %xmm7 -; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpsllw %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpsllw %xmm3, %xmm5, %xmm6 +; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX1-NEXT: vpsubb %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlw %xmm1, %xmm6, %xmm6 -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm5, %xmm5 +; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm3 @@ -954,7 +943,6 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm3 @@ -975,7 +963,6 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512VL-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm3 @@ -1073,8 +1060,8 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v32i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm2, %xmm2 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -575,8 +575,9 @@ ; AVX2-NEXT: vpsllw $1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -400,8 +400,9 @@ ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -709,9 +709,9 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 @@ -736,7 +736,6 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -756,7 +755,6 @@ ; ; AVX2-LABEL: splatvar_funnnel_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; 
AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] @@ -841,8 +839,6 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -862,7 +858,6 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -877,7 +872,6 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -892,7 +886,6 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -907,7 +900,6 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -922,7 +914,6 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -951,10 +942,10 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 @@ -980,7 +971,6 @@ ; AVX1-LABEL: splatvar_funnnel_v32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero @@ -1008,7 +998,6 @@ ; ; AVX2-LABEL: splatvar_funnnel_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -1031,7 +1020,6 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -1054,7 +1042,6 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, 
%xmm2 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-narrow-binop.ll b/llvm/test/CodeGen/X86/vector-narrow-binop.ll --- a/llvm/test/CodeGen/X86/vector-narrow-binop.ll +++ b/llvm/test/CodeGen/X86/vector-narrow-binop.ll @@ -160,21 +160,23 @@ ; ; AVX1-LABEL: fmul_v2f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-NEXT: vmulpd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmulpd %ymm0, %ymm0, %ymm0 ; AVX1-NEXT: vmulpd %xmm2, %xmm2, %xmm1 -; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: fmul_v2f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-NEXT: vmulpd %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1] +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vmulpd %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vmulpd %xmm2, %xmm2, %xmm1 -; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-pack-256.ll b/llvm/test/CodeGen/X86/vector-pack-256.ll --- a/llvm/test/CodeGen/X86/vector-pack-256.ll +++ b/llvm/test/CodeGen/X86/vector-pack-256.ll @@ -243,8 +243,9 @@ ; AVX2-NEXT: vpsrld $17, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -364,8 +364,9 @@ ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ 
b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -638,8 +638,8 @@ ; ; XOPAVX1-LABEL: splatvar_rotate_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -647,8 +647,8 @@ ; ; XOPAVX2-LABEL: splatvar_rotate_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm2, %xmm2 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -664,7 +664,6 @@ define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX1-LABEL: splatvar_rotate_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero @@ -682,7 +681,6 @@ ; ; AVX2-LABEL: splatvar_rotate_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero @@ -738,8 +736,8 @@ ; ; XOPAVX1-LABEL: splatvar_rotate_v8i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -747,8 +745,8 @@ ; ; XOPAVX2-LABEL: splatvar_rotate_v8i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm2, %xmm2 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -764,8 +762,6 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX1-LABEL: splatvar_rotate_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero @@ -783,7 +779,6 @@ ; ; AVX2-LABEL: splatvar_rotate_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm2 @@ -796,7 +791,6 @@ ; ; AVX512F-LABEL: splatvar_rotate_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm2 @@ -809,7 +803,6 @@ ; ; AVX512VL-LABEL: splatvar_rotate_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512VL-NEXT: 
vpsllw %xmm2, %ymm0, %ymm2 @@ -822,7 +815,6 @@ ; ; AVX512BW-LABEL: splatvar_rotate_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm2 @@ -835,7 +827,6 @@ ; ; AVX512VLBW-LABEL: splatvar_rotate_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm2 @@ -862,9 +853,9 @@ ; ; XOPAVX1-LABEL: splatvar_rotate_v16i16: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -872,8 +863,8 @@ ; ; XOPAVX2-LABEL: splatvar_rotate_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm2, %xmm2 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -889,35 +880,33 @@ define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX1-LABEL: splatvar_rotate_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsllw %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpsllw %xmm3, %xmm6, %xmm7 -; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpsllw %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpsllw %xmm3, %xmm5, %xmm6 +; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX1-NEXT: vpsubb %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlw %xmm1, %xmm6, %xmm6 -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm5, %xmm5 +; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: 
vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: splatvar_rotate_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm3
@@ -938,7 +927,6 @@
 ;
 ; AVX512F-LABEL: splatvar_rotate_v32i8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm3
@@ -959,7 +947,6 @@
 ;
 ; AVX512VL-LABEL: splatvar_rotate_v32i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm3
@@ -979,7 +966,6 @@
 ;
 ; AVX512BW-LABEL: splatvar_rotate_v32i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm2
@@ -993,7 +979,6 @@
 ;
 ; AVX512VLBW-LABEL: splatvar_rotate_v32i8:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm2
@@ -1007,7 +992,6 @@
 ;
 ; AVX512VBMI2-LABEL: splatvar_rotate_v32i8:
 ; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm1, %xmm1
 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm2
@@ -1021,7 +1005,6 @@
 ;
 ; AVX512VLVBMI2-LABEL: splatvar_rotate_v32i8:
 ; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm1, %xmm1
 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm2
@@ -1045,8 +1028,8 @@
 ;
 ; XOPAVX2-LABEL: splatvar_rotate_v32i8:
 ; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm2, %xmm2
 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -677,9 +677,9 @@
 ;
 ; XOPAVX1-LABEL: splatvar_shift_v4i64:
 ; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; XOPAVX1-NEXT: vpshaq %xmm1, %xmm2, %xmm2
 ; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -212,9 +212,9 @@
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -669,8 +669,8 @@
 ;
 ; XOPAVX2-LABEL: splatvar_shift_v32i8:
 ; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm2
 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
--- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
@@ -148,9 +148,9 @@
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -275,9 +275,9 @@
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -3477,39 +3477,19 @@
 ; This test used to crash due to bad handling of concat_vectors after a bitcast
 ; in lowerVectorShuffleAsBroadcast.
 define <8 x float> @broadcast_concat_crash(<4 x float> %x, <4 x float> %y, float %z) {
-; AVX1-LABEL: broadcast_concat_crash:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,3,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
+; AVX1OR2-LABEL: broadcast_concat_crash:
+; AVX1OR2: # %bb.0: # %entry
+; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1OR2-NEXT: retq
 ;
-; AVX2-LABEL: broadcast_concat_crash:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-SLOW-LABEL: broadcast_concat_crash:
-; AVX512VL-SLOW: # %bb.0: # %entry
-; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: broadcast_concat_crash:
-; AVX512VL-FAST: # %bb.0: # %entry
-; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
-; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [1,4,3,3]
-; AVX512VL-FAST-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1
-; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: broadcast_concat_crash:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = [3,4,3,3]
+; AVX512VL-NEXT: vpermi2ps %xmm2, %xmm1, %xmm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
 entry:
 %tmp = shufflevector <4 x float> %x, <4 x float> %y, <8 x i32>
 %bc = bitcast <8 x float> %tmp to <4 x i64>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -804,8 +804,9 @@
 ; ALL-NEXT: vbroadcastss {{.*#+}} ymm0 = [-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0]
 ; ALL-NEXT: vmulps 32(%rdi), %ymm0, %ymm0
 ; ALL-NEXT: vcvtps2pd %ymm0, %zmm0
-; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,4,5,0,1,0,1]
-; ALL-NEXT: vmovapd %ymm0, {{[0-9]+}}(%rsp)
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; ALL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
 ; ALL-NEXT: movq %rbp, %rsp
 ; ALL-NEXT: popq %rbp
 ; ALL-NEXT: .cfi_def_cfa %rsp, 8
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
@@ -32,30 +32,13 @@
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc_add_v4i64_v4i32:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc_add_v4i64_v4i32:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc_add_v4i64_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: trunc_add_v4i64_v4i32:
 ; AVX512: # %bb.0:
@@ -163,9 +146,9 @@
 ; AVX2-LABEL: trunc_add_v8i32_v8i16:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -434,9 +417,10 @@
 ;
 ; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
 ; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -475,29 +459,13 @@
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc_add_const_v4i64_v4i32:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc_add_const_v4i64_v4i32:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-FAST-PERLANE-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc_add_const_v4i64_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
 ; AVX512: # %bb.0:
@@ -590,8 +558,9 @@
 ;
 ; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -814,30 +783,13 @@
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc_sub_v4i64_v4i32:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc_sub_v4i64_v4i32:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc_sub_v4i64_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: trunc_sub_v4i64_v4i32:
 ; AVX512: # %bb.0:
@@ -945,9 +897,9 @@
 ; AVX2-LABEL: trunc_sub_v8i32_v8i16:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -1225,29 +1177,13 @@
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc_sub_const_v4i64_v4i32:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc_sub_const_v4i64_v4i32:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-FAST-PERLANE-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc_sub_const_v4i64_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
 ; AVX512: # %bb.0:
@@ -1340,8 +1276,9 @@
 ;
 ; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -1599,34 +1536,15 @@
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc_mul_v4i64_v4i32:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc_mul_v4i64_v4i32:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-FAST-PERLANE-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc_mul_v4i64_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
 ; AVX512F: # %bb.0:
@@ -1796,9 +1714,9 @@
 ; AVX2-LABEL: trunc_mul_v8i32_v8i16:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -2111,9 +2029,10 @@
 ;
 ; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
 ; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -2155,29 +2074,13 @@
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc_mul_const_v4i64_v4i32:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc_mul_const_v4i64_v4i32:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-FAST-PERLANE-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc_mul_const_v4i64_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
 ; AVX512: # %bb.0:
@@ -2270,8 +2173,9 @@
 ;
 ; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -2574,30 +2478,13 @@
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc_and_v4i64_v4i32:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc_and_v4i64_v4i32:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc_and_v4i64_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: trunc_and_v4i64_v4i32:
 ; AVX512: # %bb.0:
@@ -2697,9 +2584,9 @@
 ; AVX2-LABEL: trunc_and_v8i32_v8i16:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -2938,29 +2825,13 @@
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc_and_const_v4i64_v4i32:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc_and_const_v4i64_v4i32:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc_and_const_v4i64_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
 ; AVX512: # %bb.0:
@@ -3053,8 +2924,9 @@
 ;
 ; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -3275,30 +3147,13 @@
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc_xor_v4i64_v4i32:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc_xor_v4i64_v4i32:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc_xor_v4i64_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: trunc_xor_v4i64_v4i32:
 ; AVX512: # %bb.0:
@@ -3398,9 +3253,9 @@
 ; AVX2-LABEL: trunc_xor_v8i32_v8i16:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -3639,29 +3494,13 @@
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc_xor_const_v4i64_v4i32:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc_xor_const_v4i64_v4i32:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-FAST-PERLANE-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc_xor_const_v4i64_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
 ; AVX512: # %bb.0:
@@ -3754,8 +3593,9 @@
 ;
 ; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -3976,30 +3816,13 @@
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc_or_v4i64_v4i32:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc_or_v4i64_v4i32:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc_or_v4i64_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: trunc_or_v4i64_v4i32:
 ; AVX512: # %bb.0:
@@ -4099,9 +3922,9 @@
 ; AVX2-LABEL: trunc_or_v8i32_v8i16:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -4340,29 +4163,13 @@
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc_or_const_v4i64_v4i32:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc_or_const_v4i64_v4i32:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-FAST-PERLANE-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc_or_const_v4i64_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
 ; AVX512: # %bb.0:
@@ -4455,8 +4262,9 @@
 ;
 ; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -4,8 +4,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
@@ -493,45 +493,18 @@
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: trunc_packus_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc_packus_v4i64_v4i32:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
-; AVX2-FAST-ALL-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc_packus_v4i64_v4i32:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc_packus_v4i64_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: trunc_packus_v4i64_v4i32:
 ; AVX512F: # %bb.0:
@@ -921,63 +894,24 @@
 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-SLOW-NEXT: vpand %ymm1, %ymm3, %ymm1
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
-; AVX2-SLOW-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc_packus_v8i64_v8i32:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-FAST-ALL-NEXT: vpand %ymm1, %ymm3, %ymm1
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
-; AVX2-FAST-ALL-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc_packus_v8i64_v8i32:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpand %ymm1, %ymm3, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc_packus_v8i64_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
+; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: trunc_packus_v8i64_v8i32:
 ; AVX512: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -4,8 +4,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
@@ -499,45 +499,18 @@
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: trunc_ssat_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc_ssat_v4i64_v4i32:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc_ssat_v4i64_v4i32:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc_ssat_v4i64_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: trunc_ssat_v4i64_v4i32:
 ; AVX512F: # %bb.0:
@@ -943,63 +916,24 @@
 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: trunc_ssat_v8i64_v8i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc_ssat_v8i64_v8i32:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc_ssat_v8i64_v8i32:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc_ssat_v8i64_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647]
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
+; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: trunc_ssat_v8i64_v8i32:
 ; AVX512: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -4,8 +4,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
@@ -340,45 +340,18 @@
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: trunc_usat_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-SLOW-NEXT: vpxor %ymm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
-; AVX2-SLOW-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc_usat_v4i64_v4i32:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-FAST-ALL-NEXT: vpxor %ymm1, %ymm0, %ymm1
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc_usat_v4i64_v4i32:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-FAST-PERLANE-NEXT: vpxor %ymm1, %ymm0, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc_usat_v4i64_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
+; AVX2-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: trunc_usat_v4i64_v4i32:
 ; AVX512F: # %bb.0:
@@ -644,60 +617,23 @@
 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; AVX1-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: trunc_usat_v8i64_v8i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm4
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm3
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc_usat_v8i64_v8i32:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm1, %ymm4
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm0, %ymm3
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc_usat_v8i64_v8i32:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm1, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm0, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc_usat_v8i64_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm4
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm3
+; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
+; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: trunc_usat_v8i64_v8i32:
 ; AVX512: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -3,9 +3,9 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
@@ -22,34 +22,12 @@
 ; SSE-NEXT: movaps %xmm2, %xmm1
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: trunc8i64_8i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc8i64_8i32:
-; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc8i64_8i32:
-; AVX2-FAST-ALL: # %bb.0: # %entry
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32:
-; AVX2-FAST-PERLANE: # %bb.0: # %entry
-; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX-LABEL: trunc8i64_8i32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: trunc8i64_8i32:
 ; AVX512: # %bb.0: # %entry
@@ -68,34 +46,12 @@
 ; SSE-NEXT: movaps %xmm2, %xmm1
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: trunc8i64_8i32_ashr:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
-; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_ashr:
-; AVX2-FAST-ALL: # %bb.0: # %entry
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_ashr:
-; AVX2-FAST-PERLANE: # %bb.0: # %entry
-; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX-LABEL: trunc8i64_8i32_ashr:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
+; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: trunc8i64_8i32_ashr:
 ; AVX512: # %bb.0: # %entry
@@ -116,34 +72,12 @@
 ; SSE-NEXT: movaps %xmm2, %xmm1
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: trunc8i64_8i32_lshr:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
-; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_lshr:
-; AVX2-FAST-ALL: # %bb.0: # %entry
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_lshr:
-; AVX2-FAST-PERLANE: # %bb.0: # %entry
-; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX-LABEL: trunc8i64_8i32_lshr:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
+; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: trunc8i64_8i32_lshr:
 ; AVX512: # %bb.0: # %entry
@@ -355,9 +289,9 @@
 ;
 ; AVX2-LABEL: trunc8i32_8i16:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpshufb
{{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1365,34 +1299,12 @@ ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: retq ; -; AVX1-LABEL: trunc2x4i64_8i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc2x4i64_8i32: -; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-ALL-LABEL: trunc2x4i64_8i32: -; AVX2-FAST-ALL: # %bb.0: # %entry -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: trunc2x4i64_8i32: -; AVX2-FAST-PERLANE: # %bb.0: # %entry -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] -; AVX2-FAST-PERLANE-NEXT: retq +; AVX-LABEL: trunc2x4i64_8i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc2x4i64_8i32: ; AVX512F: # %bb.0: # %entry @@ -1904,25 +1816,11 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: retq ; -; AVX1-LABEL: PR32160: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: PR32160: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: PR32160: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX-LABEL: PR32160: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq ; ; AVX512F-LABEL: PR32160: ; AVX512F: # %bb.0: @@ -2141,9 +2039,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vextracti128 $1, 
%ymm1, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: shlq $4, %rdi +; AVX2-NEXT: vmovdqu %xmm0, (%rsi,%rdi) +; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -1932,19 +1932,11 @@ } define <2 x i64> @PR37616(<16 x i64>* %a0) { -; AVX1-LABEL: PR37616: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-NEXT: retq -; -; AVX2OR512-LABEL: PR37616: -; AVX2OR512: # %bb.0: -; AVX2OR512-NEXT: vmovaps (%rdi), %ymm0 -; AVX2OR512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2OR512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2OR512-NEXT: vzeroupper -; AVX2OR512-NEXT: retq +; AVX-LABEL: PR37616: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX-NEXT: retq %load = load <16 x i64>, <16 x i64>* %a0, align 128 %shuffle = shufflevector <16 x i64> %load, <16 x i64> undef, <2 x i32> <i32 2, i32 6> ret <2 x i64> %shuffle }
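
For reference, the PR37616 improvement in the final hunk is easy to reproduce in isolation. The snippet below is an illustrative sketch, not part of the patch: the RUN line, triple, feature string, and the AVX check prefix are assumptions for a standalone file, while the IR body and the expected codegen are copied verbatim from the hunk above. Before the change, AVX2 targets materialized the whole shuffle as a 256-bit vunpcklpd followed by vextractf128 and vzeroupper; afterwards the same two elements come from one 128-bit vunpcklpd of a load at 16(%rdi) and a memory operand, with no ymm traffic at all.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
define <2 x i64> @PR37616(<16 x i64>* %a0) {
; AVX-LABEL: PR37616:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps 16(%rdi), %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    retq
  %load = load <16 x i64>, <16 x i64>* %a0, align 128
  %shuffle = shufflevector <16 x i64> %load, <16 x i64> undef, <2 x i32> <i32 2, i32 6>
  ret <2 x i64> %shuffle
}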