Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -25502,12 +25502,6 @@
   return SDValue();
 }
 
-static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
-                                      const X86Subtarget *Subtarget) {
-  return detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG, Subtarget,
-                          SDLoc(N));
-}
-
 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
@@ -26150,6 +26144,105 @@
   return SDValue();
 }
 
+/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
+/// bitand and X86ISD::PACKSS operations. We do it here because after type
+/// legalization the truncation will be translated into a BUILD_VECTOR with
+/// each element extracted from a vector and then truncated, and it is
+/// difficult to do this optimization based on that form.
+static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
+                                       const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+
+  SDValue In = N->getOperand(0);
+  if (!In.getValueType().isSimple())
+    return SDValue();
+
+  EVT InVT = In.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // AVX512 provides vpmovdb. The behavior of X86ISD::PACKSS on AVX2 is
+  // different from that on SSE2, so we cannot benefit from this method on
+  // AVX2.
+  if (!Subtarget->hasSSE2() || Subtarget->hasAVX2())
+    return SDValue();
+
+  EVT ElemVT = VT.getVectorElementType();
+  EVT WideElemVT = InVT.getVectorElementType();
+  if (!((ElemVT == MVT::i8 || ElemVT == MVT::i16) &&
+        (WideElemVT == MVT::i32 || WideElemVT == MVT::i64) &&
+        isPowerOf2_32(NumElems) && NumElems >= 8))
+    return SDValue();
+
+  // SSSE3's pshufb results in fewer instructions in the cases below.
+  if (Subtarget->hasSSSE3() && NumElems == 8 &&
+      (ElemVT == MVT::i8 || (WideElemVT == MVT::i32 && ElemVT == MVT::i16)))
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // First, use a mask to clear all bits that will not appear in the result.
+  SDValue MaskVal =
+      DAG.getConstant(ElemVT == MVT::i8 ? 255 : 65535, DL, WideElemVT);
+  SDValue MaskVec = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT,
+                                SmallVector<SDValue, 8>(NumElems, MaskVal));
+  SDValue MaskedIn = DAG.getNode(ISD::AND, DL, InVT, MaskVec, In);
+
+  // PACKSS does not handle vXi64, so bitcast it to vXi32 first.
+  if (WideElemVT == MVT::i64) {
+    NumElems *= 2;
+    MVT InterVT = MVT::getVectorVT(MVT::i32, NumElems);
+    MaskedIn = DAG.getNode(ISD::BITCAST, DL, InterVT, MaskedIn);
+  }
+
+  // Split a long vector into vectors of legal type.
+  unsigned RegNum = InVT.getSizeInBits() / 128;
+  SmallVector<SDValue, 8> SubVec(RegNum);
+  for (unsigned i = 0; i < RegNum; i++)
+    SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, MaskedIn,
+                            DAG.getIntPtrConstant(i * 4, DL));
+
+  MVT UnpackedVT = MVT::v4i32;
+  MVT PackedVT = MVT::v8i16;
+  unsigned ElemSize = WideElemVT.getSizeInBits();
+  // In each iteration, halve the element size.
+  for (unsigned j = 1, e = WideElemVT.getSizeInBits() / ElemVT.getSizeInBits();
+       j < e; j *= 2, ElemSize /= 2, RegNum /= 2) {
+    // When the element size becomes 16, we need to perform X86ISD::PACKSS on
+    // vXi16 instead of vXi32.
+    if (ElemSize == 16) {
+      UnpackedVT = MVT::v8i16;
+      PackedVT = MVT::v16i8;
+    }
+    for (unsigned i = 0; i < RegNum; i++)
+      SubVec[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, SubVec[i]);
+    for (unsigned i = 0; i < RegNum / 2; i++)
+      SubVec[i] = DAG.getNode(X86ISD::PACKSS, DL, PackedVT, SubVec[i * 2],
+                              SubVec[i * 2 + 1]);
+  }
+
+  // If the result type is v8i8, we need to do one more X86ISD::PACKSS and
+  // then extract a subvector as the result, since v8i8 is not a legal type.
+  if (VT == MVT::v8i8) {
+    SubVec[0] = DAG.getNode(X86ISD::PACKSS, DL, PackedVT, SubVec[0], SubVec[0]);
+    SubVec[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SubVec[0],
+                            DAG.getIntPtrConstant(0, DL));
+  }
+  return SubVec[0];
+}
+
+static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  // Try to detect an AVG pattern first.
+  SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG,
+                                 Subtarget, SDLoc(N));
+  if (Avg.getNode())
+    return Avg;
+
+  return combineVectorTruncation(N, DAG, Subtarget);
+}
+
 /// Do target-specific dag combines on floating point negations.
 static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
Index: test/CodeGen/X86/vector-trunc.ll
===================================================================
--- test/CodeGen/X86/vector-trunc.ll
+++ test/CodeGen/X86/vector-trunc.ll
@@ -64,74 +64,52 @@
 define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
 ; SSE2-LABEL: trunc8i64_8i16:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: pextrw $4, %xmm1, %eax
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-NEXT: pextrw $4, %xmm0, %ecx
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: pextrw $4, %xmm3, %edx
-; SSE2-NEXT: movd %edx, %xmm1
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSE2-NEXT: pextrw $4, %xmm2, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: movd %ecx, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,0,0,65535,0,0,0]
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: packssdw %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: trunc8i64_8i16:
 ; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: pextrw $4, %xmm1, %eax
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSSE3-NEXT: pextrw $4, %xmm0, %ecx
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: pextrw $4, %xmm3, %edx
-; SSSE3-NEXT: movd %edx, %xmm1
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSSE3-NEXT: pextrw $4, %xmm2, %eax
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,0,0,65535,0,0,0]
+; SSSE3-NEXT: pand %xmm4, %xmm3
+; SSSE3-NEXT: pand %xmm4, %xmm2
+; SSSE3-NEXT: packssdw %xmm3, %xmm2
+; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: pand %xmm4, %xmm0
+; SSSE3-NEXT: packssdw %xmm1, %xmm0
+; SSSE3-NEXT: packssdw %xmm2, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: trunc8i64_8i16:
 ; SSE41: # BB#0: # %entry
-; SSE41-NEXT: pextrw $4, %xmm0, %eax
-; SSE41-NEXT: pinsrw $1, %eax, %xmm0
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: pinsrw $2, %eax, %xmm0
-; SSE41-NEXT: pextrw $4, %xmm1, %eax
-; SSE41-NEXT: pinsrw $3, %eax, %xmm0
-; SSE41-NEXT: movd %xmm2, %eax
-; SSE41-NEXT: pinsrw $4, %eax, %xmm0
-; SSE41-NEXT: pextrw $4, %xmm2, %eax
-; SSE41-NEXT: pinsrw $5, %eax, %xmm0
-; SSE41-NEXT: movd %xmm3, %eax
-; SSE41-NEXT: pinsrw $6, %eax, %xmm0
-; SSE41-NEXT: pextrw $4, %xmm3, %eax
-; SSE41-NEXT: pinsrw $7, %eax, %xmm0
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; SSE41-NEXT: packssdw %xmm3, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; SSE41-NEXT: packssdw %xmm1, %xmm0
+; SSE41-NEXT: packssdw %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: trunc8i64_8i16:
 ; AVX1: # BB#0: # %entry
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
@@ -153,13 +131,10 @@
 define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
 ; SSE2-LABEL: trunc8i32_8i16:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: trunc8i32_8i16:
@@ -202,11 +177,11 @@
 define void @trunc8i32_8i8(<8 x i32> %a) {
 ; SSE2-LABEL: trunc8i32_8i8:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
 ; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: packsswb %xmm0, %xmm0
 ; SSE2-NEXT: movq %xmm0, (%rax)
 ; SSE2-NEXT: retq
 ;
@@ -254,128 +229,31 @@
 }
 
 define void @trunc16i32_16i8(<16 x i32> %a) {
-; SSE2-LABEL: trunc16i32_16i8:
-; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc16i32_16i8:
-; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; SSSE3-NEXT: movdqu %xmm0, (%rax)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc16i32_16i8:
-; SSE41: # BB#0: # %entry
-; SSE41-NEXT: pextrb $4, %xmm0, %eax
-; SSE41-NEXT: pextrb $8, %xmm0, %ecx
-; SSE41-NEXT: pextrb $12, %xmm0, %edx
-; SSE41-NEXT: pinsrb $1, %eax, %xmm0
-; SSE41-NEXT: pinsrb $2, %ecx, %xmm0
-; SSE41-NEXT: pinsrb $3, %edx, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: pinsrb $4, %eax, %xmm0
-; SSE41-NEXT: pextrb $4, %xmm1, %eax
-; SSE41-NEXT: pinsrb $5, %eax, %xmm0
-; SSE41-NEXT: pextrb $8, %xmm1, %eax
-; SSE41-NEXT: pinsrb $6, %eax, %xmm0
-; SSE41-NEXT: pextrb $12, %xmm1, %eax
-; SSE41-NEXT: pinsrb $7, %eax, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm2, %eax
-; SSE41-NEXT: pinsrb $8, %eax, %xmm0
-; SSE41-NEXT: pextrb $4, %xmm2, %eax
-; SSE41-NEXT: pinsrb $9, %eax, %xmm0
-; SSE41-NEXT: pextrb $8, %xmm2, %eax
-; SSE41-NEXT: pinsrb $10, %eax, %xmm0
-; SSE41-NEXT: pextrb $12, %xmm2, %eax
-; SSE41-NEXT: pinsrb $11, %eax, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm3, %eax
-; SSE41-NEXT: pinsrb $12, %eax, %xmm0
-; SSE41-NEXT: pextrb $4, %xmm3, %eax
-; SSE41-NEXT: pinsrb $13, %eax, %xmm0
-; SSE41-NEXT: pextrb $8, %xmm3, %eax
-; SSE41-NEXT: pinsrb $14, %eax, %xmm0
-; SSE41-NEXT: pextrb $12, %xmm3, %eax
-; SSE41-NEXT: pinsrb $15, %eax, %xmm0
-; SSE41-NEXT: movdqu %xmm0, (%rax)
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc16i32_16i8:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm2, %xmm0
+; SSE-NEXT: movdqu %xmm0, (%rax)
+; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc16i32_16i8:
 ; AVX1: # BB#0: # %entry
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
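
For context, the following is a minimal C++ sketch, not part of the patch, of the kind of scalar narrowing loop that the vectorizer typically turns into the <8 x i32> -> <8 x i16> truncations exercised in vector-trunc.ll; the function and parameter names are invented for illustration. With this combine, such a truncation on plain SSE2 is expected to lower to the pand + packssdw sequence shown in the updated trunc8i32_8i16 checks rather than a chain of shuffles.

// Illustrative only (not from the patch): a scalar loop that the loop
// vectorizer commonly rewrites into vector truncates such as
// "trunc <8 x i32> %x to <8 x i16>", the pattern combineVectorTruncation
// turns into an AND with 65535 followed by X86ISD::PACKSS nodes.
void narrow_i32_to_i16(const int *in, short *out, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = static_cast<short>(in[i]); // per-element i32 -> i16 truncation
}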