Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -25486,12 +25486,6 @@ return SDValue(); } -static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { - return detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG, Subtarget, - SDLoc(N)); -} - /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -26135,6 +26129,160 @@ return SDValue(); } +/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS. +static SDValue +combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, + SmallVector &Regs) { + assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 || + Regs[0].getValueType() == MVT::v2i64)); + EVT OutVT = N->getValueType(0); + EVT ElemVT = OutVT.getVectorElementType(); + EVT InVT = Regs[0].getValueType(); + EVT WideElemVT = InVT.getVectorElementType(); + SDLoc DL(N); + + // First, use mask to unset all bits that won't appear in the result. + SDValue MaskVal = + DAG.getConstant(ElemVT == MVT::i8 ? 255 : 65535, DL, WideElemVT); + SDValue MaskVec = DAG.getNode( + ISD::BUILD_VECTOR, DL, InVT, + SmallVector(InVT.getVectorNumElements(), MaskVal)); + for (auto &Reg : Regs) + Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg); + + MVT UnpackedVT, PackedVT; + if (ElemVT == MVT::i8) { + UnpackedVT = MVT::v8i16; + PackedVT = MVT::v16i8; + } else { + UnpackedVT = MVT::v4i32; + PackedVT = MVT::v8i16; + } + + // In each iteration, truncate the type by a half size. + auto RegNum = Regs.size(); + for (unsigned j = 1, e = WideElemVT.getSizeInBits() / ElemVT.getSizeInBits(); + j < e; j *= 2, RegNum /= 2) { + for (unsigned i = 0; i < RegNum; i++) + Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]); + for (unsigned i = 0; i < RegNum / 2; i++) + Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2], + Regs[i * 2 + 1]); + } + + // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and + // then extract a subvector as the result since v8i8 is not a legal type. + if (OutVT == MVT::v8i8) { + Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]); + Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0], + DAG.getIntPtrConstant(0, DL)); + return Regs[0]; + } else if (RegNum > 1) { + Regs.resize(RegNum); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs); + } else + return Regs[0]; +} + +/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS. +static SDValue +combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG, + SmallVector &Regs) { + assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32); + EVT OutVT = N->getValueType(0); + SDLoc DL(N); + + // Shift left by 16 bits, then arithmetic-shift right by 16 bits. + SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32); + for (auto &Reg : Regs) { + Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG); + Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG); + } + + for (unsigned i = 0, e = Regs.size() / 2; i < e; i++) + Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2], + Regs[i * 2 + 1]); + + if (Regs.size() > 2) { + Regs.resize(Regs.size() / 2); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs); + } else + return Regs[0]; +} + +/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into +/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type +/// legalization the truncation will be translated into a BUILD_VECTOR with each +/// element that is extracted from a vector and then truncated, and it is +/// diffcult to do this optimization based on them. +static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT OutVT = N->getValueType(0); + if (!OutVT.isVector()) + return SDValue(); + + SDValue In = N->getOperand(0); + if (!In.getValueType().isSimple()) + return SDValue(); + + EVT InVT = In.getValueType(); + unsigned NumElems = OutVT.getVectorNumElements(); + + // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on + // SSE2, and we need to take care of it specially. + // AVX512 provides vpmovdb. + if (!Subtarget->hasSSE2() || Subtarget->hasAVX2()) + return SDValue(); + + EVT ElemVT = OutVT.getVectorElementType(); + EVT WideElemVT = InVT.getVectorElementType(); + if (!((WideElemVT == MVT::i32 || WideElemVT == MVT::i64) && + (ElemVT == MVT::i8 || ElemVT == MVT::i16) && isPowerOf2_32(NumElems) && + NumElems >= 8)) + return SDValue(); + + // SSSE3's pshufb results in less instructions in the cases below. + if (Subtarget->hasSSSE3() && NumElems == 8 && + (ElemVT == MVT::i8 || (WideElemVT == MVT::i32 && ElemVT == MVT::i16))) + return SDValue(); + + SDLoc DL(N); + + // Split a long vector into vectors of legal type. + unsigned RegNum = InVT.getSizeInBits() / 128; + SmallVector SubVec(RegNum); + if (WideElemVT == MVT::i32) { + for (unsigned i = 0; i < RegNum; i++) + SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, + DAG.getIntPtrConstant(i * 4, DL)); + } else { + for (unsigned i = 0; i < RegNum; i++) + SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, + DAG.getIntPtrConstant(i * 2, DL)); + } + + // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PAKCUS + // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to + // truncate 2 x v4i32 to v8i16. + if (Subtarget->hasSSE41() || ElemVT == MVT::i8) + return combineVectorTruncationWithPACKUS(N, DAG, SubVec); + else if (WideElemVT == MVT::i32) + return combineVectorTruncationWithPACKSS(N, DAG, SubVec); + else + return SDValue(); +} + +static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + // Try to detect AVG pattern first. + SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG, + Subtarget, SDLoc(N)); + if (Avg.getNode()) + return Avg; + + return combineVectorTruncation(N, DAG, Subtarget); +} + /// Do target-specific dag combines on floating point negations. static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { Index: test/CodeGen/X86/vector-trunc.ll =================================================================== --- test/CodeGen/X86/vector-trunc.ll +++ test/CodeGen/X86/vector-trunc.ll @@ -4,6 +4,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) { ; SSE2-LABEL: trunc8i64_8i32: @@ -56,6 +57,11 @@ ; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512BW-LABEL: trunc8i64_8i32: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: retq entry: %0 = trunc <8 x i64> %a to <8 x i32> ret <8 x i32> %0 @@ -102,6 +108,89 @@ ; ; SSE41-LABEL: trunc8i64_8i16: ; SSE41: # BB#0: # %entry +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc8i64_8i16: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc8i64_8i16: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: trunc8i64_8i16: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: retq +entry: + %0 = trunc <8 x i64> %a to <8 x i16> + ret <8 x i16> %0 +} + +define void @trunc8i64_8i8(<8 x i64> %a) { +; SSE2-LABEL: trunc8i64_8i8: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movq %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc8i64_8i8: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: pextrw $4, %xmm1, %eax +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSSE3-NEXT: pextrw $4, %xmm0, %ecx +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: pextrw $4, %xmm3, %edx +; SSSE3-NEXT: movd %edx, %xmm1 +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSSE3-NEXT: pextrw $4, %xmm2, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movq %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc8i64_8i8: +; SSE41: # BB#0: # %entry ; SSE41-NEXT: pextrw $4, %xmm0, %eax ; SSE41-NEXT: pinsrw $1, %eax, %xmm0 ; SSE41-NEXT: movd %xmm1, %eax @@ -116,50 +205,59 @@ ; SSE41-NEXT: pinsrw $6, %eax, %xmm0 ; SSE41-NEXT: pextrw $4, %xmm3, %eax ; SSE41-NEXT: pinsrw $7, %eax, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movq %xmm0, (%rax) ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc8i64_8i16: +; AVX1-LABEL: trunc8i64_8i8: ; AVX1: # BB#0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovq %xmm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc8i64_8i16: +; AVX2-LABEL: trunc8i64_8i8: ; AVX2: # BB#0: # %entry ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> ; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq +; +; AVX512BW-LABEL: trunc8i64_8i8: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: vpmovqb %zmm0, (%rax) +; AVX512BW-NEXT: retq entry: - %0 = trunc <8 x i64> %a to <8 x i16> - ret <8 x i16> %0 + %0 = trunc <8 x i64> %a to <8 x i8> + store <8 x i8> %0, <8 x i8>* undef, align 4 + ret void } define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) { ; SSE2-LABEL: trunc8i32_8i16: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc8i32_8i16: @@ -194,6 +292,11 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq +; +; AVX512BW-LABEL: trunc8i32_8i16: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: retq entry: %0 = trunc <8 x i32> %a to <8 x i16> ret <8 x i16> %0 @@ -202,7 +305,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) { ; SSE2-LABEL: trunc8i32_8i8: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: packuswb %xmm1, %xmm0 @@ -247,6 +350,13 @@ ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq +; +; AVX512BW-LABEL: trunc8i32_8i8: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovq %xmm0, (%rax) +; AVX512BW-NEXT: retq entry: %0 = trunc <8 x i32> %a to <8 x i8> store <8 x i8> %0, <8 x i8>* undef, align 4 @@ -254,128 +364,31 @@ } define void @trunc16i32_16i8(<16 x i32> %a) { -; SSE2-LABEL: trunc16i32_16i8: -; SSE2: # BB#0: # %entry -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc16i32_16i8: -; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSSE3-NEXT: movdqu %xmm0, (%rax) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc16i32_16i8: -; SSE41: # BB#0: # %entry -; SSE41-NEXT: pextrb $4, %xmm0, %eax -; SSE41-NEXT: pextrb $8, %xmm0, %ecx -; SSE41-NEXT: pextrb $12, %xmm0, %edx -; SSE41-NEXT: pinsrb $1, %eax, %xmm0 -; SSE41-NEXT: pinsrb $2, %ecx, %xmm0 -; SSE41-NEXT: pinsrb $3, %edx, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: pinsrb $4, %eax, %xmm0 -; SSE41-NEXT: pextrb $4, %xmm1, %eax -; SSE41-NEXT: pinsrb $5, %eax, %xmm0 -; SSE41-NEXT: pextrb $8, %xmm1, %eax -; SSE41-NEXT: pinsrb $6, %eax, %xmm0 -; SSE41-NEXT: pextrb $12, %xmm1, %eax -; SSE41-NEXT: pinsrb $7, %eax, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm2, %eax -; SSE41-NEXT: pinsrb $8, %eax, %xmm0 -; SSE41-NEXT: pextrb $4, %xmm2, %eax -; SSE41-NEXT: pinsrb $9, %eax, %xmm0 -; SSE41-NEXT: pextrb $8, %xmm2, %eax -; SSE41-NEXT: pinsrb $10, %eax, %xmm0 -; SSE41-NEXT: pextrb $12, %xmm2, %eax -; SSE41-NEXT: pinsrb $11, %eax, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm3, %eax -; SSE41-NEXT: pinsrb $12, %eax, %xmm0 -; SSE41-NEXT: pextrb $4, %xmm3, %eax -; SSE41-NEXT: pinsrb $13, %eax, %xmm0 -; SSE41-NEXT: pextrb $8, %xmm3, %eax -; SSE41-NEXT: pinsrb $14, %eax, %xmm0 -; SSE41-NEXT: pextrb $12, %xmm3, %eax -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 -; SSE41-NEXT: movdqu %xmm0, (%rax) -; SSE41-NEXT: retq +; SSE-LABEL: trunc16i32_16i8: +; SSE: # BB#0: # %entry +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqu %xmm0, (%rax) +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc16i32_16i8: ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -394,6 +407,11 @@ ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq +; +; AVX512BW-LABEL: trunc16i32_16i8: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: vpmovdb %zmm0, (%rax) +; AVX512BW-NEXT: retq entry: %0 = trunc <16 x i32> %a to <16 x i8> store <16 x i8> %0, <16 x i8>* undef, align 4 @@ -451,6 +469,13 @@ ; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512BW-LABEL: trunc2x4i64_8i32: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: retq entry: %0 = trunc <4 x i64> %a to <4 x i32> %1 = trunc <4 x i64> %b to <4 x i32> @@ -543,6 +568,16 @@ ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq +; +; AVX512BW-LABEL: trunc2x4i64_8i16: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: retq entry: %0 = trunc <4 x i64> %a to <4 x i16> %1 = trunc <4 x i64> %b to <4 x i16> @@ -585,6 +620,13 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: retq +; +; AVX512BW-LABEL: trunc2x2i64_4i32: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512BW-NEXT: retq entry: %0 = trunc <2 x i64> %a to <2 x i32> %1 = trunc <2 x i64> %b to <2 x i32> @@ -604,6 +646,12 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: retq +; +; AVX512BW-LABEL: trunc2i64_i64: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: retq entry: %0 = trunc <2 x i64> %inval to <2 x i32> %1 = bitcast <2 x i32> %0 to i64 @@ -645,6 +693,14 @@ ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq +; +; AVX512BW-LABEL: trunc2x4i32_8i16: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: retq entry: %0 = trunc <4 x i32> %a to <4 x i16> %1 = trunc <4 x i32> %b to <4 x i16> @@ -679,6 +735,12 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: retq +; +; AVX512BW-LABEL: trunc4i32_i64: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: retq entry: %0 = trunc <4 x i32> %inval to <4 x i16> %1 = bitcast <4 x i16> %0 to i64 @@ -717,6 +779,14 @@ ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq +; +; AVX512BW-LABEL: trunc2x8i16_16i8: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: retq entry: %0 = trunc <8 x i16> %a to <8 x i8> %1 = trunc <8 x i16> %b to <8 x i8> @@ -750,6 +820,12 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: retq +; +; AVX512BW-LABEL: trunc8i16_i64: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: retq entry: %0 = trunc <8 x i16> %inval to <8 x i8> %1 = bitcast <8 x i8> %0 to i64 @@ -766,6 +842,11 @@ ; AVX: # BB#0: # %entry ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq +; +; AVX512BW-LABEL: trunc16i64_16i8_const: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: retq entry: %0 = trunc <16 x i64> zeroinitializer to <16 x i8>