Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -13948,16 +13948,21 @@
     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
 
   assert(InVT.getVectorElementType() == MVT::i1);
-  MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
+
+  // Extend VT if the target is 256 or 128bit vector and VLX is not supported.
+  MVT ExtVT = VT;
+  if (!VT.is512BitVector() && !Subtarget.hasVLX())
+    ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+
   SDValue One =
    DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
   SDValue Zero =
    DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
 
-  SDValue V = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
-  if (VT.is512BitVector())
-    return V;
-  return DAG.getNode(X86ISD::VTRUNC, DL, VT, V);
+  SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
+  if (VT == ExtVT)
+    return SelectedVal;
+  return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
 }
 
 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
@@ -15047,16 +15052,15 @@
   }
 }
 
-static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
-                                     const X86Subtarget &Subtarget) {
+static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
+
   SDValue Op0 = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
   SDValue CC = Op.getOperand(2);
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
 
-  assert(Op0.getSimpleValueType().getVectorElementType().getSizeInBits() >= 8 &&
-         Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+  assert(VT.getVectorElementType() == MVT::i1 &&
          "Cannot set masked compare for this operation");
 
   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
@@ -15194,26 +15198,26 @@
   if (VT.is256BitVector() && !Subtarget.hasInt256())
     return Lower256IntVSETCC(Op, DAG);
 
+  // Operands are boolean (vectors of i1)
   MVT OpVT = Op1.getSimpleValueType();
   if (OpVT.getVectorElementType() == MVT::i1)
     return LowerBoolVSETCC_AVX512(Op, DAG);
 
-  bool MaskResult = (VT.getVectorElementType() == MVT::i1);
-  if (Subtarget.hasAVX512()) {
-    if (Op1.getSimpleValueType().is512BitVector() ||
-        (Subtarget.hasBWI() && Subtarget.hasVLX()) ||
-        (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
-      return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
-
+  // The result is boolean, but operands are int/float
+  if (VT.getVectorElementType() == MVT::i1) {
     // In AVX-512 architecture setcc returns mask with i1 elements,
     // But there is no compare instruction for i8 and i16 elements in KNL.
-    // We are not talking about 512-bit operands in this case, these
-    // types are illegal.
-    if (MaskResult &&
-        (OpVT.getVectorElementType().getSizeInBits() < 32 &&
-         OpVT.getVectorElementType().getSizeInBits() >= 8))
-      return DAG.getNode(ISD::TRUNCATE, dl, VT,
-                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
+    // In this case use SSE compare
+    bool UseAVX512Inst =
+        (OpVT.is512BitVector() ||
+         OpVT.getVectorElementType().getSizeInBits() >= 32 ||
+         (Subtarget.hasBWI() && Subtarget.hasVLX()));
+
+    if (UseAVX512Inst)
+      return LowerIntVSETCC_AVX512(Op, DAG);
+
+    return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                       DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
   }
 
   // Lower using XOP integer comparisons.
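
For context, a minimal LLVM IR sketch of the pattern the new ExtVT computation handles; the function below is hypothetical and not part of this patch. On a KNL-like target (AVX-512F without VLX), zero-extending an <8 x i1> mask to <8 x i32> is now lowered by widening to ExtVT = v8i64 (512/8 = 64-bit elements), selecting between splat(1) and splat(0), and truncating back, instead of assuming only the v8i64/v16i32 cases:

    ; Hypothetical example, assuming a target with AVX-512F but no VLX.
    define <8 x i32> @zext_8xi1_to_8xi32(<8 x i32> %x, <8 x i32> %y) {
      ; The compare yields an <8 x i1> mask (a k-register on AVX-512).
      %mask = icmp eq <8 x i32> %x, %y
      ; Lowered as a VSELECT in v8i64 between all-ones and all-zeros splats,
      ; followed by X86ISD::VTRUNC down to the v8i32 result type.
      %ext = zext <8 x i1> %mask to <8 x i32>
      ret <8 x i32> %ext
    }
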
Index: llvm/trunk/test/CodeGen/X86/avx512-ext.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-ext.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-ext.ll
@@ -1879,3 +1879,47 @@
   %2 = bitcast <8 x i32> %1 to <4 x i64>
   ret <4 x i64> %2
 }
+
+define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 {
+; KNL-LABEL: zext_64xi1_to_64xi8:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqb %ymm3, %ymm1, %ymm1
+; KNL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: zext_64xi1_to_64xi8:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpcmpeqb %zmm1, %zmm0, %k1
+; SKX-NEXT:    vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+  %mask = icmp eq <64 x i8> %x, %y
+  %1 = zext <64 x i1> %mask to <64 x i8>
+  ret <64 x i8> %1
+}
+
+define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
+; KNL-LABEL: zext_4xi1_to_4x32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; KNL-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; KNL-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: zext_4xi1_to_4x32:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SKX-NEXT:    vpandq %xmm2, %xmm1, %xmm1
+; SKX-NEXT:    vpandq %xmm2, %xmm0, %xmm0
+; SKX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; SKX-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; SKX-NEXT:    retq
+  %mask = icmp eq <4 x i8> %x, %y
+  %1 = zext <4 x i1> %mask to <4 x i32>
+  ret <4 x i32> %1
+}
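
The CHECK lines above follow the output style of llvm/utils/update_llc_test_checks.py. As a rough sketch of how one prefix could be exercised by hand; the RUN lines of avx512-ext.ll are not shown in this hunk, so the triple and CPU flags below are assumptions inferred from the KNL/SKX prefixes and the Mach-O "##" asm comments:

    llc < llvm/trunk/test/CodeGen/X86/avx512-ext.ll -mtriple=x86_64-apple-darwin -mcpu=knl \
      | FileCheck llvm/trunk/test/CodeGen/X86/avx512-ext.ll --check-prefix=KNL
    llc < llvm/trunk/test/CodeGen/X86/avx512-ext.ll -mtriple=x86_64-apple-darwin -mcpu=skx \
      | FileCheck llvm/trunk/test/CodeGen/X86/avx512-ext.ll --check-prefix=SKX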