Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1318,6 +1318,13 @@
       setOperationAction(ISD::FNEARBYINT, VT, Legal);
     }
 
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
+
+    // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
+    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
+
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
@@ -1518,6 +1525,8 @@
     setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
     setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
 
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
+
     setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
     if (Subtarget.hasVLX()) {
       setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
@@ -16370,9 +16379,13 @@
   return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
 }
 
-static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
-                                             const X86Subtarget &Subtarget,
-                                             SelectionDAG &DAG) {
+// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
+// For sign extend this needs to handle all vector sizes and SSE4.1 and
+// non-SSE4.1 targets. For zero extend this should only handle inputs of
+// MVT::v64i8 when BWI is not supported, but AVX512 is.
+static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
+                                        const X86Subtarget &Subtarget,
+                                        SelectionDAG &DAG) {
   SDValue In = Op->getOperand(0);
   MVT VT = Op->getSimpleValueType(0);
   MVT InVT = In.getSimpleValueType();
@@ -16387,20 +16400,33 @@
   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
     return SDValue();
   if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
-      !(VT.is256BitVector() && Subtarget.hasInt256()))
+      !(VT.is256BitVector() && Subtarget.hasInt256()) &&
+      !(VT.is512BitVector() && Subtarget.hasAVX512()))
     return SDValue();
 
   SDLoc dl(Op);
 
   // For 256-bit vectors, we only need the lower (128-bit) half of the input.
-  if (VT.is256BitVector())
-    In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
-                     MVT::getVectorVT(InSVT, InVT.getVectorNumElements() / 2),
-                     In, DAG.getIntPtrConstant(0, dl));
+  // For 512-bit vectors, we need 128-bits or 256-bits.
+  if (VT.getSizeInBits() > 128) {
+    // Input needs to be at least the same number of elements as output, and
+    // at least 128-bits.
+    int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
+    In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
+  }
+
+  assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
+          InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
 
   // SSE41 targets can use the pmovsx* instructions directly.
+  unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
+                    X86ISD::VSEXT : X86ISD::VZEXT;
   if (Subtarget.hasSSE41())
-    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+    return DAG.getNode(ExtOpc, dl, VT, In);
+
+  // We should only get here for sign extend.
+  assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
+         "Unexpected opcode!");
 
   // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
   SDValue Curr = In;
@@ -22077,8 +22103,9 @@
   case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
   case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
   case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
+  case ISD::ZERO_EXTEND_VECTOR_INREG:
   case ISD::SIGN_EXTEND_VECTOR_INREG:
-    return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG);
+    return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
   case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
   case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
   case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
@@ -31068,7 +31095,8 @@
   // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
   // Also use this if we don't have SSE41 to allow the legalizer do its job.
   if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
-      (VT.is256BitVector() && Subtarget.hasInt256())) {
+      (VT.is256BitVector() && Subtarget.hasInt256()) ||
+      (VT.is512BitVector() && Subtarget.hasAVX512())) {
     SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
     return Opcode == ISD::SIGN_EXTEND
                ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
Index: test/CodeGen/X86/avx512-pmovxrm.ll
===================================================================
--- test/CodeGen/X86/avx512-pmovxrm.ll
+++ test/CodeGen/X86/avx512-pmovxrm.ll
@@ -38,16 +38,12 @@
 ; X32-LABEL: test_llvm_x86_avx512_pmovsxbq:
 ; X32: ## BB#0:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; X32-NEXT: vpsllq $56, %zmm0, %zmm0
-; X32-NEXT: vpsraq $56, %zmm0, %zmm0
+; X32-NEXT: vpmovsxbq (%eax), %zmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_llvm_x86_avx512_pmovsxbq:
 ; X64: ## BB#0:
-; X64-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; X64-NEXT: vpsllq $56, %zmm0, %zmm0
-; X64-NEXT: vpsraq $56, %zmm0, %zmm0
+; X64-NEXT: vpmovsxbq (%rdi), %zmm0
 ; X64-NEXT: retq
   %1 = load <16 x i8>, <16 x i8>* %a, align 1
   %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32>
@@ -139,22 +135,14 @@
 ; X32-LABEL: test_llvm_x86_avx512_pmovzxbq:
 ; X32: ## BB#0:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; X32-NEXT: vpand %ymm2, %ymm1, %ymm1
-; X32-NEXT: vpand %ymm2, %ymm0, %ymm0
-; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-NEXT: vmovdqu (%eax), %xmm0
+; X32-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_llvm_x86_avx512_pmovzxbq:
 ; X64: ## BB#0:
-; X64-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; X64-NEXT: vpand %ymm2, %ymm1, %ymm1
-; X64-NEXT: vpand %ymm2, %ymm0, %ymm0
-; X64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X64-NEXT: vmovdqu (%rdi), %xmm0
+; X64-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
 ; X64-NEXT: retq
   %1 = load <16 x i8>, <16 x i8>* %a, align 1
   %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32>
Index: test/CodeGen/X86/vector-sext.ll
===================================================================
--- test/CodeGen/X86/vector-sext.ll
+++ test/CodeGen/X86/vector-sext.ll
@@ -599,9 +599,7 @@
 ;
 ; AVX512-LABEL: sext_16i8_to_8i64:
 ; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpsllq $56, %zmm0, %zmm0
-; AVX512-NEXT: vpsraq $56, %zmm0, %zmm0
+; AVX512-NEXT: vpmovsxbq %xmm0, %zmm0
 ; AVX512-NEXT: retq
 ;
 ; X32-SSE41-LABEL: sext_16i8_to_8i64:
Index: test/CodeGen/X86/vector-zext.ll
===================================================================
--- test/CodeGen/X86/vector-zext.ll
+++ test/CodeGen/X86/vector-zext.ll
@@ -461,17 +461,12 @@
 ; AVX512F-LABEL: zext_16i8_to_8i64:
 ; AVX512F: # BB#0: # %entry
 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: zext_16i8_to_8i64:
 ; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: retq
 entry:
   %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32>
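
For illustration only (not part of the patch): the pattern this lowering improves is the one exercised by sext_16i8_to_8i64 in vector-sext.ll, i.e. sign-extending the low 8 bytes of a 128-bit vector to <8 x i64>. A minimal standalone reproducer, sketched here with a hypothetical function name, would be:

; Sketch only: run through llc with -mtriple=x86_64-unknown-unknown -mattr=+avx512f.
; With this change the sext should select a single vpmovsxbq %xmm0, %zmm0 rather
; than vpmovzxbq followed by vpsllq/vpsraq, matching the updated CHECK lines above.
define <8 x i64> @sext_lo8_to_8i64(<16 x i8> %a) {
entry:
  %lo = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = sext <8 x i8> %lo to <8 x i64>
  ret <8 x i64> %ext
}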