Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -40898,8 +40898,8 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - // Requires SSE2 but AVX512 has fast truncate. - if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) + // Requires SSE2. + if (!Subtarget.hasSSE2()) return SDValue(); if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple()) @@ -40923,6 +40923,13 @@ if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64) return SDValue(); + // AVX512 has fast truncate, but if the input is already going to be split, + // there's no harm in trying pack. + if (Subtarget.hasAVX512() && + !(!Subtarget.useAVX512Regs() && VT.is256BitVector() && + InVT.is512BitVector())) + return SDValue(); + unsigned NumPackedSignBits = std::min(SVT.getSizeInBits(), 16); unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8; Index: llvm/trunk/test/CodeGen/X86/min-legal-vector-width.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/min-legal-vector-width.ll +++ llvm/trunk/test/CodeGen/X86/min-legal-vector-width.ll @@ -757,9 +757,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vpsrlq $48, 32(%rdi), %ymm0 ; CHECK-NEXT: vpsrlq $48, (%rdi), %ymm1 -; CHECK-NEXT: vpmovqd %ymm1, %xmm1 -; CHECK-NEXT: vpmovqd %ymm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; CHECK-NEXT: retq %a = load <8 x i64>, <8 x i64>* %x %b = lshr <8 x i64> %a, @@ -770,11 +769,9 @@ define <16 x i16> @trunc_v16i32_v16i16_zeroes(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" { ; CHECK-LABEL: trunc_v16i32_v16i16_zeroes: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsrld $16, 32(%rdi), %ymm0 -; CHECK-NEXT: vpsrld $16, (%rdi), %ymm1 -; CHECK-NEXT: vpmovdw %ymm1, %xmm1 -; CHECK-NEXT: vpmovdw %ymm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa (%rdi), %ymm1 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %a = load <16 x i32>, <16 x i32>* %x %b = lshr <16 x i32> %a, @@ -787,9 +784,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vpsrlw $8, 32(%rdi), %ymm0 ; CHECK-NEXT: vpsrlw $8, (%rdi), %ymm1 -; CHECK-NEXT: vpmovwb %ymm1, %xmm1 -; CHECK-NEXT: vpmovwb %ymm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; CHECK-NEXT: retq %a = load <32 x i16>, <32 x i16>* %x %b = lshr <32 x i16> %a, @@ -802,9 +798,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vpsraq $48, 32(%rdi), %ymm0 ; CHECK-NEXT: vpsraq $48, (%rdi), %ymm1 -; CHECK-NEXT: vpmovqd %ymm1, %xmm1 -; CHECK-NEXT: vpmovqd %ymm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vpackssdw %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; CHECK-NEXT: retq %a = load <8 x i64>, <8 x i64>* %x %b = ashr <8 x i64> %a, @@ -817,9 +812,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vpsrad $16, 32(%rdi), %ymm0 ; CHECK-NEXT: vpsrad $16, (%rdi), %ymm1 -; CHECK-NEXT: vpmovdw %ymm1, %xmm1 -; CHECK-NEXT: vpmovdw %ymm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vpackssdw %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; CHECK-NEXT: retq %a = load <16 x i32>, <16 x i32>* %x %b = ashr <16 x i32> %a, @@ -832,9 +826,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vpsraw $8, 32(%rdi), %ymm0 ; CHECK-NEXT: vpsraw $8, (%rdi), %ymm1 -; CHECK-NEXT: vpmovwb %ymm1, %xmm1 -; CHECK-NEXT: vpmovwb %ymm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vpacksswb %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; CHECK-NEXT: retq %a = load <32 x i16>, <32 x i16>* %x %b = ashr <32 x i16> %a,