Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -33842,6 +33842,36 @@ unsigned NumMaskElts = Mask.size(); unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); + if (AllowIntDomain && EltSizeInBits == 64 && + ((MaskVT.is128BitVector() && Subtarget.hasVLX()) || + (MaskVT.is256BitVector() && Subtarget.hasVLX()) || + (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { + if (!any_of(Mask, [](int M) { return M == SM_SentinelZero; })) { + int Rotation = matchShuffleAsElementRotate(V1, V2, Mask); + if (0 < Rotation) { + Shuffle = X86ISD::VALIGN; + ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64); + PermuteImm = Rotation; + return true; + } + } + } + + if (AllowIntDomain && EltSizeInBits == 32 && + ((MaskVT.is128BitVector() && Subtarget.hasVLX()) || + (MaskVT.is256BitVector() && Subtarget.hasVLX()) || + (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { + if (!any_of(Mask, [](int M) { return M == SM_SentinelZero; })) { + int Rotation = matchShuffleAsElementRotate(V1, V2, Mask); + if (0 < Rotation) { + Shuffle = X86ISD::VALIGN; + ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32); + PermuteImm = Rotation; + return true; + } + } + } + // Attempt to match against PALIGNR byte rotate. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || @@ -34035,10 +34065,14 @@ // is different from the root element size - this would prevent writemasks // from being reused. // TODO - this currently prevents all lane shuffles from occurring. - // TODO - check for writemasks usage instead of always preventing combining. // TODO - attempt to narrow Mask back to writemask size. 
- bool IsEVEXShuffle = - RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128); + bool IsMaskedShuffle = false; + if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) { + if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT && + Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) { + IsMaskedShuffle = true; + } + } // Attempt to match a subvector broadcast. // shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0) @@ -34079,7 +34113,60 @@ return DAG.getBitcast(RootVT, Res); } - // TODO - handle AVX512 cases with X86ISD::SHUF128. + SmallVector Mask; + if (BaseMaskEltSizeInBits > 128) { + assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size"); + int MaskScale = BaseMaskEltSizeInBits / 128; + scaleShuffleMask(MaskScale, BaseMask, Mask); + } else { + Mask.assign(BaseMask.begin(), BaseMask.end()); + } + + // Try to lower to vshuf64x2/vshuf32x4. + auto MatchSHUF128 = [](MVT ShuffleVT, const SDLoc &DL, ArrayRef Mask, + SDValue V1, SDValue V2, SelectionDAG &DAG) { + unsigned PermMask = 0; + // Ensure elements come from the same Op. + SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)}; + for (int i = 0; i < 4; ++i) { + assert(Mask[i] >= -1 && "Illegal shuffle sentinel value"); + if (Mask[i] < 0) + continue; + + SDValue Op = Mask[i] >= 4 ? V2 : V1; + unsigned OpIndex = i / 2; + if (Ops[OpIndex].isUndef()) + Ops[OpIndex] = Op; + else if (Ops[OpIndex] != Op) + return SDValue(); + + // Convert the 128-bit shuffle mask selection values into 128-bit selection + // bits defined by a vshuf64x2 instruction's immediate control byte. + PermMask |= (Mask[i] % 4) << (i * 2); + } + + return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, + DAG.getBitcast(ShuffleVT, Ops[0]), + DAG.getBitcast(ShuffleVT, Ops[1]), + DAG.getTargetConstant(PermMask, DL, MVT::i8)); + }; + + // FIXME: Is there a better way to do this? 
is256BitLaneRepeatedShuffleMask + // doesn't work because our mask is for 128 bits and we don't have an MVT + // to match that. + bool PreferPERMQ = UnaryShuffle && + isUndefOrInRange(Mask[0], 0, 2) && + isUndefOrInRange(Mask[1], 0, 2) && + isUndefOrInRange(Mask[2], 2, 4) && + isUndefOrInRange(Mask[3], 2, 4) && + (Mask[0] < 0 || Mask[2] < 0 || Mask[0] == (Mask[2] % 2)) && + (Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2)); + + + if (!isAnyZero(Mask) && !PreferPERMQ) { + if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG)) + return DAG.getBitcast(RootVT, V); + } } // Handle 128-bit lane shuffles of 256-bit vectors. @@ -34119,8 +34206,11 @@ return DAG.getBitcast(RootVT, Res); } + if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128) + return SDValue(); // Nothing to do! + // TODO - handle AVX512VL cases with X86ISD::SHUF128. - if (!UnaryShuffle && !IsEVEXShuffle) { + if (!UnaryShuffle && !IsMaskedShuffle) { assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) && "Unexpected shuffle sentinel value"); // Prefer blends to X86ISD::VPERM2X128. @@ -34200,7 +34290,7 @@ // Attempt to match against broadcast-from-vector. // Limit AVX1 to cases where we're loading+broadcasting a scalar element. if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) - && (!IsEVEXShuffle || NumRootElts == NumMaskElts)) { + && (!IsMaskedShuffle || NumRootElts == NumMaskElts)) { SmallVector BroadcastMask(NumMaskElts, 0); if (isTargetShuffleEquivalent(Mask, BroadcastMask)) { if (V1.getValueType() == MaskVT && @@ -34226,7 +34316,7 @@ if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) && - (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { + (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! 
Res = DAG.getBitcast(ShuffleSrcVT, NewV1); @@ -34237,7 +34327,7 @@ if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, Subtarget, Shuffle, ShuffleVT, PermuteImm) && - (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { + (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = DAG.getBitcast(ShuffleVT, V1); @@ -34252,7 +34342,7 @@ if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT, UnaryShuffle) && - (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { + (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1); @@ -34266,7 +34356,7 @@ if (matchBinaryPermuteShuffle( MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) && - (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { + (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! 
NewV1 = DAG.getBitcast(ShuffleVT, NewV1); Index: llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll =================================================================== --- llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll +++ llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll @@ -655,11 +655,10 @@ ; X86-AVX512VL-LABEL: test_x86_sse2_storeu_pd: ; X86-AVX512VL: # %bb.0: ; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512VL-NEXT: vmovsd {{\.LCPI.*}}, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x0d,A,A,A,A] +; X86-AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x57,0xc9] +; X86-AVX512VL-NEXT: vmovhpd {{\.LCPI.*}}, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x16,0x0d,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 -; X86-AVX512VL-NEXT: # xmm1 = mem[0],zero -; X86-AVX512VL-NEXT: vpslldq $8, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x73,0xf9,0x08] -; X86-AVX512VL-NEXT: # xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X86-AVX512VL-NEXT: # xmm1 = xmm1[0],mem[0] ; X86-AVX512VL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] ; X86-AVX512VL-NEXT: vmovupd %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x11,0x00] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] @@ -676,11 +675,10 @@ ; ; X64-AVX512VL-LABEL: test_x86_sse2_storeu_pd: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovsd {{.*}}(%rip), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x0d,A,A,A,A] +; X64-AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x57,0xc9] +; X64-AVX512VL-NEXT: vmovhpd {{.*}}(%rip), %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x16,0x0d,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: # xmm1 = 
mem[0],zero -; X64-AVX512VL-NEXT: vpslldq $8, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x73,0xf9,0x08] -; X64-AVX512VL-NEXT: # xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX512VL-NEXT: # xmm1 = xmm1[0],mem[0] ; X64-AVX512VL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] ; X64-AVX512VL-NEXT: vmovupd %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x11,0x07] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] Index: llvm/test/CodeGen/X86/avx512-cvt.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-cvt.ll +++ llvm/test/CodeGen/X86/avx512-cvt.ll @@ -120,30 +120,20 @@ } define <2 x float> @sltof2f32(<2 x i64> %a) { -; NOVLDQ-LABEL: sltof2f32: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: vpextrq $1, %xmm0, %rax -; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; NOVLDQ-NEXT: vmovq %xmm0, %rax -; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; NOVLDQ-NEXT: retq +; NODQ-LABEL: sltof2f32: +; NODQ: # %bb.0: +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; NODQ-NEXT: retq ; ; VLDQ-LABEL: sltof2f32: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0 ; VLDQ-NEXT: retq ; -; VLNODQ-LABEL: sltof2f32: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; VLNODQ-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; VLNODQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; VLNODQ-NEXT: retq -; ; DQNOVL-LABEL: sltof2f32: ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 Index: llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll 
=================================================================== --- llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -2171,8 +2171,8 @@ define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) { ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7] -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> @@ -2181,9 +2181,10 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7] +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 -; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: valignq {{.*#+}} xmm1 {%k1} = xmm3[1],xmm0[0] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> @@ -2195,9 +2196,9 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0: ; CHECK: # %bb.0: +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 -; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,2,3,7,4,6,7] -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: valignq {{.*#+}} xmm0 {%k1} {z} = xmm2[1],xmm0[0] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> Index: llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll 
=================================================================== --- llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll +++ llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll @@ -350,11 +350,10 @@ ; X86-AVX512-LABEL: test_x86_sse2_storeu_pd: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovsd LCPI11_0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x0d,A,A,A,A] +; X86-AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x57,0xc9] +; X86-AVX512-NEXT: vmovhpd LCPI11_0, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x16,0x0d,A,A,A,A] ; X86-AVX512-NEXT: ## fixup A - offset: 4, value: LCPI11_0, kind: FK_Data_4 -; X86-AVX512-NEXT: ## xmm1 = mem[0],zero -; X86-AVX512-NEXT: vpslldq $8, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x73,0xf9,0x08] -; X86-AVX512-NEXT: ## xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X86-AVX512-NEXT: ## xmm1 = xmm1[0],mem[0] ; X86-AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] ; X86-AVX512-NEXT: vmovupd %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x11,0x00] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] @@ -381,11 +380,10 @@ ; ; X64-AVX512-LABEL: test_x86_sse2_storeu_pd: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovsd {{.*}}(%rip), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x0d,A,A,A,A] +; X64-AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x57,0xc9] +; X64-AVX512-NEXT: vmovhpd {{.*}}(%rip), %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x16,0x0d,A,A,A,A] ; X64-AVX512-NEXT: ## fixup A - offset: 4, value: LCPI11_0-4, kind: reloc_riprel_4byte -; X64-AVX512-NEXT: ## xmm1 = mem[0],zero -; X64-AVX512-NEXT: vpslldq $8, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x73,0xf9,0x08] -; X64-AVX512-NEXT: ## 
xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX512-NEXT: ## xmm1 = xmm1[0],mem[0] ; X64-AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] ; X64-AVX512-NEXT: vmovupd %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x11,0x07] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] Index: llvm/test/CodeGen/X86/sse41.ll =================================================================== --- llvm/test/CodeGen/X86/sse41.ll +++ llvm/test/CodeGen/X86/sse41.ll @@ -1976,10 +1976,8 @@ ; ; AVX512-LABEL: insertps_5: ; AVX512: ## %bb.0: -; AVX512-NEXT: vpblendd $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x02] -; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512-NEXT: vmovq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc0] -; AVX512-NEXT: ## xmm0 = xmm0[0],zero +; AVX512-NEXT: vinsertps $92, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x5c] +; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],zero,zero ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x float> %A, i32 0 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 @@ -2060,8 +2058,6 @@ ; AVX512: ## %bb.0: ; AVX512-NEXT: vinsertps $28, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x1c] ; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[0],zero,zero -; AVX512-NEXT: vmovq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc0] -; AVX512-NEXT: ## xmm0 = xmm0[0],zero ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x float> %A, i32 0 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 Index: llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll =================================================================== --- llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll +++ llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll @@ -1350,7 +1350,8 @@ ; ; AVX512VL-32-LABEL: 
uitofp_v2i64_v2f64: ; AVX512VL-32: # %bb.0: -; AVX512VL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm1 +; AVX512VL-32-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX512VL-32-NEXT: vpor {{\.LCPI.*}}, %xmm1, %xmm1 ; AVX512VL-32-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vpor {{\.LCPI.*}}, %xmm0, %xmm0 @@ -1360,7 +1361,8 @@ ; ; AVX512VL-64-LABEL: uitofp_v2i64_v2f64: ; AVX512VL-64: # %bb.0: -; AVX512VL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; AVX512VL-64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-64-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX512VL-64-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 ; AVX512VL-64-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX512VL-64-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 Index: llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll =================================================================== --- llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll +++ llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll @@ -830,7 +830,8 @@ ; ; AVX512VL-32-LABEL: uitofp_v4i64_v4f64: ; AVX512VL-32: # %bb.0: -; AVX512VL-32-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm1 +; AVX512VL-32-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX512VL-32-NEXT: vpor {{\.LCPI.*}}, %ymm1, %ymm1 ; AVX512VL-32-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX512VL-32-NEXT: vpor {{\.LCPI.*}}, %ymm0, %ymm0 @@ -840,7 +841,8 @@ ; ; AVX512VL-64-LABEL: uitofp_v4i64_v4f64: ; AVX512VL-64: # %bb.0: -; AVX512VL-64-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1 +; AVX512VL-64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-64-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX512VL-64-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1 ; AVX512VL-64-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX512VL-64-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0 Index: llvm/test/CodeGen/X86/vec_int_to_fp.ll 
=================================================================== --- llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -624,7 +624,8 @@ ; ; AVX512VL-LABEL: uitofp_2i64_to_2f64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 @@ -970,7 +971,8 @@ ; ; AVX512VL-LABEL: uitofp_4i64_to_4f64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1 +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0 @@ -1249,8 +1251,7 @@ ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: sitofp_2i64_to_4f32: @@ -1319,8 +1320,7 @@ ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: sitofp_2i64_to_4f32_zero: @@ -1387,8 +1387,7 @@ ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: 
vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: sitofp_4i64_to_4f32_undef: @@ -1947,8 +1946,7 @@ ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_2i64_to_4f32: @@ -2067,8 +2065,7 @@ ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_2i64_to_2f32: @@ -2214,8 +2211,7 @@ ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_4i64_to_4f32_undef: @@ -3353,7 +3349,8 @@ ; AVX512VL-LABEL: uitofp_load_2i64_to_2f64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 @@ -3726,7 +3723,8 @@ ; AVX512VL-LABEL: 
uitofp_load_4i64_to_4f64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1 +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0 @@ -5881,8 +5879,8 @@ ; AVX512VL-LABEL: PR43609: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4294967295,4294967295] -; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm3 +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] ; AVX512VL-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 @@ -5891,7 +5889,7 @@ ; AVX512VL-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] ; AVX512VL-NEXT: vsubpd %xmm6, %xmm0, %xmm0 ; AVX512VL-NEXT: vaddpd %xmm0, %xmm3, %xmm0 -; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX512VL-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512VL-NEXT: vpsrlq $32, %xmm1, %xmm1 ; AVX512VL-NEXT: vpor %xmm5, %xmm1, %xmm1 Index: llvm/test/CodeGen/X86/vector-fshl-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-256.ll +++ llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -2115,15 +2115,14 @@ ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512VL-NEXT: vpmullw 
{{.*}}(%rip), %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] +; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] ; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 Index: llvm/test/CodeGen/X86/vector-fshl-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-512.ll +++ llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -1213,19 +1213,18 @@ ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7 ; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm10 ; AVX512VL-NEXT: vpblendvb %ymm10, %ymm7, %ymm4, %ymm4 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; 
AVX512VL-NEXT: vpsrlw $8, %ymm7, %ymm7 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] -; AVX512VL-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512VL-NEXT: vpmullw %ymm7, %ymm11, %ymm7 -; AVX512VL-NEXT: vpsrlw $8, %ymm7, %ymm7 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] +; AVX512VL-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm2[8],ymm7[8],ymm2[9],ymm7[9],ymm2[10],ymm7[10],ymm2[11],ymm7[11],ymm2[12],ymm7[12],ymm2[13],ymm7[13],ymm2[14],ymm7[14],ymm2[15],ymm7[15],ymm2[24],ymm7[24],ymm2[25],ymm7[25],ymm2[26],ymm7[26],ymm2[27],ymm7[27],ymm2[28],ymm7[28],ymm2[29],ymm7[29],ymm2[30],ymm7[30],ymm2[31],ymm7[31] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] ; AVX512VL-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512VL-NEXT: vpmullw %ymm2, %ymm12, %ymm2 +; AVX512VL-NEXT: vpmullw %ymm12, %ymm11, %ymm11 +; AVX512VL-NEXT: vpsrlw $8, %ymm11, %ymm11 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[16],ymm7[16],ymm2[17],ymm7[17],ymm2[18],ymm7[18],ymm2[19],ymm7[19],ymm2[20],ymm7[20],ymm2[21],ymm7[21],ymm2[22],ymm7[22],ymm2[23],ymm7[23] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] +; AVX512VL-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512VL-NEXT: vpmullw %ymm2, %ymm13, %ymm2 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512VL-NEXT: vpackuswb %ymm7, %ymm2, %ymm2 +; AVX512VL-NEXT: vpackuswb %ymm11, %ymm2, %ymm2 ; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = 
[18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360] ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 @@ -1237,13 +1236,11 @@ ; AVX512VL-NEXT: vpblendvb %ymm9, %ymm5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm5 ; AVX512VL-NEXT: vpblendvb %ymm10, %ymm5, %ymm3, %ymm3 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15],ymm1[24],ymm7[24],ymm1[25],ymm7[25],ymm1[26],ymm7[26],ymm1[27],ymm7[27],ymm1[28],ymm7[28],ymm1[29],ymm7[29],ymm1[30],ymm7[30],ymm1[31],ymm7[31] +; AVX512VL-NEXT: vpmullw %ymm5, %ymm12, %ymm5 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512VL-NEXT: vpmullw %ymm5, %ymm11, %ymm5 -; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vpmullw %ymm1, %ymm12, %ymm1 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[16],ymm7[16],ymm1[17],ymm7[17],ymm1[18],ymm7[18],ymm1[19],ymm7[19],ymm1[20],ymm7[20],ymm1[21],ymm7[21],ymm1[22],ymm7[22],ymm1[23],ymm7[23] +; AVX512VL-NEXT: vpmullw %ymm1, %ymm13, %ymm1 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 @@ -1267,15 +1264,14 @@ ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1 ; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] +; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm4, %zmm4 +; AVX512BW-NEXT: vpsrlw $8, %zmm4, %zmm4 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), 
%zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpackuswb %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 ; AVX512BW-NEXT: kmovq %rax, %k1 @@ -1299,15 +1295,14 @@ ; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 ; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1 ; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1} -; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 -; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512VBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] +; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm4, %zmm4 +; AVX512VBMI2-NEXT: vpsrlw $8, %zmm4, %zmm4 +; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = 
zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] ; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512VBMI2-NEXT: vpackuswb %zmm2, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpackuswb %zmm4, %zmm1, %zmm1 ; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512VBMI2-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 ; AVX512VBMI2-NEXT: kmovq %rax, %k1 @@ -1331,15 +1326,14 @@ ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1 ; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1} -; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm4 = 
zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] +; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm4, %zmm4 +; AVX512VLBW-NEXT: vpsrlw $8, %zmm4, %zmm4 +; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpackuswb %zmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512VLBW-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 ; AVX512VLBW-NEXT: kmovq %rax, %k1 @@ -1363,15 +1357,14 @@ ; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1 ; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1} -; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] +; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm4, %zmm4 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm4, %zmm4 +; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = 
zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] ; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512VLVBMI2-NEXT: vpackuswb %zmm2, %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vpackuswb %zmm4, %zmm1, %zmm1 ; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512VLVBMI2-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 ; AVX512VLVBMI2-NEXT: kmovq %rax, %k1 Index: llvm/test/CodeGen/X86/vector-fshl-rot-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -1239,15 +1239,14 @@ ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 -; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] +; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] ; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-fshl-rot-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -632,19 +632,18 @@ ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] -; AVX512VL-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512VL-NEXT: vpmullw %ymm5, %ymm9, %ymm5 -; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = 
[256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] +; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] ; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512VL-NEXT: vpmullw %ymm1, %ymm10, %ymm1 +; AVX512VL-NEXT: vpmullw %ymm10, %ymm9, %ymm9 +; AVX512VL-NEXT: vpsrlw $8, %ymm9, %ymm9 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] +; AVX512VL-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512VL-NEXT: vpmullw %ymm1, %ymm11, %ymm1 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 +; AVX512VL-NEXT: vpackuswb %ymm9, %ymm1, %ymm1 ; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -654,13 +653,11 @@ ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512VL-NEXT: vpmullw %ymm3, %ymm9, %ymm3 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31] +; AVX512VL-NEXT: vpmullw %ymm3, %ymm10, %ymm3 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmullw %ymm0, %ymm10, %ymm0 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23] +; AVX512VL-NEXT: vpmullw %ymm0, %ymm11, %ymm0 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ -683,15 +680,14 @@ ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = 
zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] +; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; @@ -711,15 +707,14 @@ ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} -; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpsllvw 
{{.*}}(%rip), %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] +; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512VLBW-NEXT: retq %res = call <64 x 
i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> ) Index: llvm/test/CodeGen/X86/vector-fshr-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-256.ll +++ llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -2118,15 +2118,14 @@ ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 -; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] ; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512VL-NEXT: vpackuswb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] +; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VL-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 Index: llvm/test/CodeGen/X86/vector-fshr-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-512.ll +++ llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -1201,22 +1201,21 @@ ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm9 ; AVX512VL-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] -; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512VL-NEXT: vpmullw %ymm4, %ymm10, %ymm4 -; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm11, %ymm11 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] -; AVX512VL-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512VL-NEXT: vpmullw %ymm12, %ymm11, %ymm11 -; AVX512VL-NEXT: vpsrlw $8, %ymm11, %ymm11 -; AVX512VL-NEXT: vpackuswb %ymm4, %ymm11, %ymm4 -; AVX512VL-NEXT: vpor %ymm4, %ymm3, %ymm3 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360] -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm10 = 
ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] +; AVX512VL-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512VL-NEXT: vpmullw %ymm11, %ymm10, %ymm10 +; AVX512VL-NEXT: vpsrlw $8, %ymm10, %ymm10 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] +; AVX512VL-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512VL-NEXT: vpmullw %ymm13, %ymm12, %ymm12 +; AVX512VL-NEXT: vpsrlw $8, %ymm12, %ymm12 +; AVX512VL-NEXT: vpackuswb %ymm10, %ymm12, %ymm10 +; AVX512VL-NEXT: vpor %ymm3, %ymm10, %ymm3 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360] +; AVX512VL-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0 @@ -1225,17 +1224,15 @@ ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm9, %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31] +; AVX512VL-NEXT: vpmullw %ymm3, %ymm11, %ymm3 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512VL-NEXT: vpmullw %ymm3, %ymm10, %ymm3 -; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512VL-NEXT: vpmullw %ymm5, %ymm12, %ymm5 -; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512VL-NEXT: vpackuswb %ymm3, %ymm5, %ymm3 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23] +; AVX512VL-NEXT: vpmullw %ymm4, %ymm13, %ymm4 +; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4 +; AVX512VL-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; @@ -1255,15 +1252,14 @@ ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; 
AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3 ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] +; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 +; AVX512BW-NEXT: vpackuswb %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vporq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 ; AVX512BW-NEXT: kmovq %rax, %k1 @@ -1286,15 +1282,14 @@ ; 
AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 ; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1 ; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} -; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 -; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3 -; AVX512VBMI2-NEXT: vpackuswb %zmm2, %zmm3, %zmm2 +; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm2 = 
zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] +; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2 +; AVX512VBMI2-NEXT: vpackuswb %zmm3, %zmm2, %zmm2 ; AVX512VBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 ; AVX512VBMI2-NEXT: kmovq %rax, %k1 @@ -1317,15 +1312,14 @@ ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1 ; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} -; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = 
zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3 -; AVX512VLBW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2 +; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] +; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm2, %zmm2 ; AVX512VLBW-NEXT: vporq %zmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 ; AVX512VLBW-NEXT: kmovq %rax, %k1 @@ -1348,15 +1342,14 @@ ; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1 ; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} -; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3 -; AVX512VLVBMI2-NEXT: vpackuswb %zmm2, %zmm3, %zmm2 +; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm2 = 
zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] +; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2 +; AVX512VLVBMI2-NEXT: vpackuswb %zmm3, %zmm2, %zmm2 ; AVX512VLVBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 ; AVX512VLVBMI2-NEXT: kmovq %rax, %k1 Index: llvm/test/CodeGen/X86/vector-fshr-rot-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -1317,15 +1317,14 @@ ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 -; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] +; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] ; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-fshr-rot-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -652,19 +652,18 @@ ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] -; AVX512VL-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512VL-NEXT: vpmullw %ymm5, %ymm9, %ymm5 -; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = 
[256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] +; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] ; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512VL-NEXT: vpmullw %ymm1, %ymm10, %ymm1 +; AVX512VL-NEXT: vpmullw %ymm10, %ymm9, %ymm9 +; AVX512VL-NEXT: vpsrlw $8, %ymm9, %ymm9 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] +; AVX512VL-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512VL-NEXT: vpmullw %ymm1, %ymm11, %ymm1 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 +; AVX512VL-NEXT: vpackuswb %ymm9, %ymm1, %ymm1 ; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -674,13 +673,11 @@ ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512VL-NEXT: vpmullw %ymm3, %ymm9, %ymm3 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31] +; AVX512VL-NEXT: vpmullw %ymm3, %ymm10, %ymm3 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmullw %ymm0, %ymm10, %ymm0 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23] +; AVX512VL-NEXT: vpmullw %ymm0, %ymm11, %ymm0 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ -703,15 +700,14 @@ ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = 
zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] +; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; @@ -731,15 +727,14 @@ ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} -; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpsllvw 
{{.*}}(%rip), %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] +; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512VLBW-NEXT: retq %res = call <64 x 
i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> ) Index: llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -257,41 +257,34 @@ ; ; AVX512BW-LABEL: test_divconstant_64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = 
zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] +; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm3, %zmm3 ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = 
zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpackuswb %zmm3, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31],zmm0[40],zmm2[40],zmm0[41],zmm2[41],zmm0[42],zmm2[42],zmm0[43],zmm2[43],zmm0[44],zmm2[44],zmm0[45],zmm2[45],zmm0[46],zmm2[46],zmm0[47],zmm2[47],zmm0[56],zmm2[56],zmm0[57],zmm2[57],zmm0[58],zmm2[58],zmm0[59],zmm2[59],zmm0[60],zmm2[60],zmm0[61],zmm2[61],zmm0[62],zmm2[62],zmm0[63],zmm2[63] +; AVX512BW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = 
zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm3, %zmm3 ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[32],zmm2[32],zmm0[33],zmm2[33],zmm0[34],zmm2[34],zmm0[35],zmm2[35],zmm0[36],zmm2[36],zmm0[37],zmm2[37],zmm0[38],zmm2[38],zmm0[39],zmm2[39],zmm0[48],zmm2[48],zmm0[49],zmm2[49],zmm0[50],zmm2[50],zmm0[51],zmm2[51],zmm0[52],zmm2[52],zmm0[53],zmm2[53],zmm0[54],zmm2[54],zmm0[55],zmm2[55] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb 
%zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] +; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %res = udiv <64 x i8> %a, ret <64 x i8> %res @@ -624,46 +617,39 @@ ; ; AVX512BW-LABEL: test_remconstant_64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = 
zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] +; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm3, %zmm3 ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpackuswb %zmm3, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = 
zmm3[8],zmm2[8],zmm3[9],zmm2[9],zmm3[10],zmm2[10],zmm3[11],zmm2[11],zmm3[12],zmm2[12],zmm3[13],zmm2[13],zmm3[14],zmm2[14],zmm3[15],zmm2[15],zmm3[24],zmm2[24],zmm3[25],zmm2[25],zmm3[26],zmm2[26],zmm3[27],zmm2[27],zmm3[28],zmm2[28],zmm3[29],zmm2[29],zmm3[30],zmm2[30],zmm3[31],zmm2[31],zmm3[40],zmm2[40],zmm3[41],zmm2[41],zmm3[42],zmm2[42],zmm3[43],zmm2[43],zmm3[44],zmm2[44],zmm3[45],zmm2[45],zmm3[46],zmm2[46],zmm3[47],zmm2[47],zmm3[56],zmm2[56],zmm3[57],zmm2[57],zmm3[58],zmm2[58],zmm3[59],zmm2[59],zmm3[60],zmm2[60],zmm3[61],zmm2[61],zmm3[62],zmm2[62],zmm3[63],zmm2[63] +; AVX512BW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm3[8],zmm1[8],zmm3[9],zmm1[9],zmm3[10],zmm1[10],zmm3[11],zmm1[11],zmm3[12],zmm1[12],zmm3[13],zmm1[13],zmm3[14],zmm1[14],zmm3[15],zmm1[15],zmm3[24],zmm1[24],zmm3[25],zmm1[25],zmm3[26],zmm1[26],zmm3[27],zmm1[27],zmm3[28],zmm1[28],zmm3[29],zmm1[29],zmm3[30],zmm1[30],zmm3[31],zmm1[31],zmm3[40],zmm1[40],zmm3[41],zmm1[41],zmm3[42],zmm1[42],zmm3[43],zmm1[43],zmm3[44],zmm1[44],zmm3[45],zmm1[45],zmm3[46],zmm1[46],zmm3[47],zmm1[47],zmm3[56],zmm1[56],zmm3[57],zmm1[57],zmm3[58],zmm1[58],zmm3[59],zmm1[59],zmm3[60],zmm1[60],zmm3[61],zmm1[61],zmm3[62],zmm1[62],zmm3[63],zmm1[63] ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm4, %zmm4 ; AVX512BW-NEXT: vpsrlw $8, %zmm4, %zmm4 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm3[0],zmm2[0],zmm3[1],zmm2[1],zmm3[2],zmm2[2],zmm3[3],zmm2[3],zmm3[4],zmm2[4],zmm3[5],zmm2[5],zmm3[6],zmm2[6],zmm3[7],zmm2[7],zmm3[16],zmm2[16],zmm3[17],zmm2[17],zmm3[18],zmm2[18],zmm3[19],zmm2[19],zmm3[20],zmm2[20],zmm3[21],zmm2[21],zmm3[22],zmm2[22],zmm3[23],zmm2[23],zmm3[32],zmm2[32],zmm3[33],zmm2[33],zmm3[34],zmm2[34],zmm3[35],zmm2[35],zmm3[36],zmm2[36],zmm3[37],zmm2[37],zmm3[38],zmm2[38],zmm3[39],zmm2[39],zmm3[48],zmm2[48],zmm3[49],zmm2[49],zmm3[50],zmm2[50],zmm3[51],zmm2[51],zmm3[52],zmm2[52],zmm3[53],zmm2[53],zmm3[54],zmm2[54],zmm3[55],zmm2[55] -; AVX512BW-NEXT: 
vpmullw {{.*}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512BW-NEXT: vpackuswb %zmm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: vpalignr {{.*#+}} zmm2 = zmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,16,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,32,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,48] -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0],zmm1[0],zmm3[1],zmm1[1],zmm3[2],zmm1[2],zmm3[3],zmm1[3],zmm3[4],zmm1[4],zmm3[5],zmm1[5],zmm3[6],zmm1[6],zmm3[7],zmm1[7],zmm3[16],zmm1[16],zmm3[17],zmm1[17],zmm3[18],zmm1[18],zmm3[19],zmm1[19],zmm3[20],zmm1[20],zmm3[21],zmm1[21],zmm3[22],zmm1[22],zmm3[23],zmm1[23],zmm3[32],zmm1[32],zmm3[33],zmm1[33],zmm3[34],zmm1[34],zmm3[35],zmm1[35],zmm3[36],zmm1[36],zmm3[37],zmm1[37],zmm3[38],zmm1[38],zmm3[39],zmm1[39],zmm3[48],zmm1[48],zmm3[49],zmm1[49],zmm3[50],zmm1[50],zmm3[51],zmm1[51],zmm3[52],zmm1[52],zmm3[53],zmm1[53],zmm3[54],zmm1[54],zmm3[55],zmm1[55] +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 +; AVX512BW-NEXT: 
vpackuswb %zmm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vpaddb %zmm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm2[8],zmm1[8],zmm2[9],zmm1[9],zmm2[10],zmm1[10],zmm2[11],zmm1[11],zmm2[12],zmm1[12],zmm2[13],zmm1[13],zmm2[14],zmm1[14],zmm2[15],zmm1[15],zmm2[24],zmm1[24],zmm2[25],zmm1[25],zmm2[26],zmm1[26],zmm2[27],zmm1[27],zmm2[28],zmm1[28],zmm2[29],zmm1[29],zmm2[30],zmm1[30],zmm2[31],zmm1[31],zmm2[40],zmm1[40],zmm2[41],zmm1[41],zmm2[42],zmm1[42],zmm2[43],zmm1[43],zmm2[44],zmm1[44],zmm2[45],zmm1[45],zmm2[46],zmm1[46],zmm2[47],zmm1[47],zmm2[56],zmm1[56],zmm2[57],zmm1[57],zmm2[58],zmm1[58],zmm2[59],zmm1[59],zmm2[60],zmm1[60],zmm2[61],zmm1[61],zmm2[62],zmm1[62],zmm2[63],zmm1[63] +; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm1[0],zmm2[1],zmm1[1],zmm2[2],zmm1[2],zmm2[3],zmm1[3],zmm2[4],zmm1[4],zmm2[5],zmm1[5],zmm2[6],zmm1[6],zmm2[7],zmm1[7],zmm2[16],zmm1[16],zmm2[17],zmm1[17],zmm2[18],zmm1[18],zmm2[19],zmm1[19],zmm2[20],zmm1[20],zmm2[21],zmm1[21],zmm2[22],zmm1[22],zmm2[23],zmm1[23],zmm2[32],zmm1[32],zmm2[33],zmm1[33],zmm2[34],zmm1[34],zmm2[35],zmm1[35],zmm2[36],zmm1[36],zmm2[37],zmm1[37],zmm2[38],zmm1[38],zmm2[39],zmm1[39],zmm2[48],zmm1[48],zmm2[49],zmm1[49],zmm2[50],zmm1[50],zmm2[51],zmm1[51],zmm2[52],zmm1[52],zmm2[53],zmm1[53],zmm2[54],zmm1[54],zmm2[55],zmm1[55] ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,16,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,32,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,48] +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; 
AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq %zmm3, %zmm1, %zmm1 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpandq %zmm4, %zmm1, %zmm1 +; AVX512BW-NEXT: vpackuswb %zmm3, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %res = urem <64 x i8> %a, Index: llvm/test/CodeGen/X86/vector-reduce-mul.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -62,7 +62,7 @@ ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -203,7 +203,7 @@ ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -413,7 +413,7 @@ ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -732,7 +732,7 @@ ; AVX512BWVL-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -1597,39 +1597,16 @@ ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; -; AVX512BW-LABEL: test_v4i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v4i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512VL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: test_v4i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512DQ-NEXT: 
vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovd %xmm0, %eax -; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax -; AVX512DQ-NEXT: retq +; AVX512-LABEL: test_v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq %1 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -1679,45 +1656,18 @@ ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; -; AVX512BW-LABEL: test_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v8i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX512VL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3] -; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512VL-NEXT: 
vpmullw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: test_v8i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovd %xmm0, %eax -; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax -; AVX512DQ-NEXT: retq +; AVX512-LABEL: test_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq %1 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -2131,20 +2081,20 @@ ; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm3, %xmm2 ; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX512DQVL-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm1 ; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm1 ; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 @@ -2378,17 +2328,17 @@ ; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX512BWVL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BWVL-NEXT: vpmovwb %ymm2, %xmm2 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 ; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1 @@ -2454,12 +2404,12 @@ ; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 ; AVX512DQVL-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 @@ -2778,17 +2728,17 @@ ; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BWVL-NEXT: vpmullw %xmm0, %xmm1, %xmm0 ; AVX512BWVL-NEXT: vpmullw %xmm0, %xmm2, %xmm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 ; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512BWVL-NEXT: 
vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1 @@ -2868,12 +2818,12 @@ ; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm1 ; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 Index: llvm/test/CodeGen/X86/vector-rotate-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-rotate-256.ll +++ llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -1232,15 +1232,14 @@ ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 -; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] +; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] ; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-rotate-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-rotate-512.ll +++ llvm/test/CodeGen/X86/vector-rotate-512.ll @@ -619,19 +619,18 @@ ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] -; AVX512VL-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512VL-NEXT: vpmullw %ymm5, %ymm9, %ymm5 -; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] +; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] ; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512VL-NEXT: vpmullw %ymm1, %ymm10, %ymm1 +; AVX512VL-NEXT: vpmullw %ymm10, %ymm9, %ymm9 +; AVX512VL-NEXT: vpsrlw $8, %ymm9, %ymm9 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] +; AVX512VL-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512VL-NEXT: vpmullw %ymm1, %ymm11, %ymm1 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 +; AVX512VL-NEXT: vpackuswb %ymm9, %ymm1, %ymm1 ; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -641,13 +640,11 @@ ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 
-; AVX512VL-NEXT: vpmullw %ymm3, %ymm9, %ymm3 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31] +; AVX512VL-NEXT: vpmullw %ymm3, %ymm10, %ymm3 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmullw %ymm0, %ymm10, %ymm0 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23] +; AVX512VL-NEXT: vpmullw %ymm0, %ymm11, %ymm0 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ -670,15 +667,14 @@ ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor 
%xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] +; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; @@ -698,15 +694,14 @@ ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} -; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; 
AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] +; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 ; AVX512VLBW-NEXT: 
vporq %zmm0, %zmm2, %zmm0 ; AVX512VLBW-NEXT: retq %shl = shl <64 x i8> %a, Index: llvm/test/CodeGen/X86/vector-shift-lshr-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-lshr-256.ll +++ llvm/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -1111,15 +1111,14 @@ ; ; AVX512DQVL-LABEL: constant_shift_v32i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQVL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX512DQVL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512DQVL-NEXT: retq ; ; AVX512BWVL-LABEL: constant_shift_v32i8: Index: llvm/test/CodeGen/X86/vector-shift-lshr-512.ll 
=================================================================== --- llvm/test/CodeGen/X86/vector-shift-lshr-512.ll +++ llvm/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -253,15 +253,14 @@ ; ; AVX512BW-LABEL: constant_shift_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] +; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = lshr <64 x i8> %a, ret <64 x i8> %shift Index: llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -2321,7 +2321,7 @@ ; AVX512VL-SLOW-LABEL: shuffle_v8i16_01100110: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] -; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i16_01100110: @@ -2371,7 +2371,7 @@ ; AVX512VL-SLOW-LABEL: shuffle_v8i16_01u0u110: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] -; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i16_01u0u110: Index: llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ 
llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -226,7 +226,7 @@ define <32 x i16> @combine_vpermi2var_32i16_as_pshufb(<32 x i16> %a0) { ; CHECK-LABEL: combine_vpermi2var_32i16_as_pshufb: ; CHECK: # %bb.0: -; CHECK-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29,34,35,32,33,38,39,36,37,42,43,40,41,46,47,44,45,50,51,48,49,54,55,52,53,58,59,56,57,62,63,60,61] +; CHECK-NEXT: vprold $16, %zmm0, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %1, <32 x i16> ) Index: llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll +++ llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll @@ -338,8 +338,7 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup(<16 x float> %x0, <16 x float> %x1) { ; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] -; CHECK-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 -1) ret <16 x float> %res0 @@ -348,16 +347,12 @@ ; X86-LABEL: combine_vpermt2var_16f32_vmovddup_load: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovaps (%eax), %zmm1 -; X86-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] -; X86-NEXT: vpermi2ps %zmm0, %zmm1, %zmm0 +; X86-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6] ; X86-NEXT: retl ; ; X64-LABEL: combine_vpermt2var_16f32_vmovddup_load: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %zmm1 -; X64-NEXT: vmovaps {{.*#+}} zmm0 = 
[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] -; X64-NEXT: vpermi2ps %zmm0, %zmm1, %zmm0 +; X64-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6] ; X64-NEXT: retq %x0 = load <16 x float>, <16 x float> *%p0 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 -1) @@ -876,9 +871,7 @@ define <16 x i32> @combine_vpermt2var_16i32_as_vpsrlq(<16 x i32> %x0) { ; CHECK-LABEL: combine_vpermt2var_16i32_as_vpsrlq: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,16,3,16,5,16,7,16,9,16,11,16,13,16,15,16] -; CHECK-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: vpsrlq $32, %zmm0, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %x0, <16 x i32> zeroinitializer, i16 -1) ret <16 x i32> %res0 @@ -887,9 +880,7 @@ define <16 x i32> @combine_vpermt2var_16i32_as_vpsllq(<16 x i32> %x0) { ; CHECK-LABEL: combine_vpermt2var_16i32_as_vpsllq: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,0,16,2,16,4,16,6,16,8,16,10,16,12,16,14] -; CHECK-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: vpsllq $32, %zmm0, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %x0, <16 x i32> zeroinitializer, i16 -1) ret <16 x i32> %res0 Index: llvm/test/CodeGen/X86/vector-shuffle-v1.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -62,9 +62,8 @@ ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z} ; AVX512VL-NEXT: movq $-1, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2] -; AVX512VL-NEXT: vpermi2q %xmm2, %xmm1, %xmm3 -; AVX512VL-NEXT: vptestmq %xmm3, %xmm3, %k1 +; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = 
xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VL-NEXT: retq ; @@ -75,9 +74,8 @@ ; VL_BW_DQ-NEXT: movq $-1, %rax ; VL_BW_DQ-NEXT: vmovq %rax, %xmm0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm1 -; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2] -; VL_BW_DQ-NEXT: vpermi2q %xmm0, %xmm1, %xmm2 -; VL_BW_DQ-NEXT: vpmovq2m %xmm2, %k0 +; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 ; VL_BW_DQ-NEXT: retq %b = shufflevector <2 x i1> %a, <2 x i1> , <2 x i32>