Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -9055,33 +9055,48 @@ SDValue V2, ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { + // TODO: If minimizing size and one of the inputs is a zero vector and the + // zero vector has only one use, we could use a VPERM2X128 to save the + // instruction bytes needed to explicitly generate the zero vector. + // Blends are faster and handle all the non-lane-crossing cases. if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, Subtarget, DAG)) return Blend; - MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), - VT.getVectorNumElements() / 2); - // Check for patterns which can be matched with a single insert of a 128-bit - // subvector. - bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); - if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { - SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, - DAG.getIntPtrConstant(0)); - SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, - OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0)); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); - } - if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 6, 7})) { - SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, - DAG.getIntPtrConstant(0)); - SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, - DAG.getIntPtrConstant(2)); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode()); + + // If either input operand is a zero vector, use VPERM2X128 because its mask + // allows us to replace the zero input with an implicit zero. 
+ if (!IsV1Zero && !IsV2Zero) { + // Check for patterns which can be matched with a single insert of a 128-bit + // subvector. + bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); + if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { + MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() / 2); + SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, + DAG.getIntPtrConstant(0)); + SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, + OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + } } - // Otherwise form a 128-bit permutation. - // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half. + // Otherwise form a 128-bit permutation. After accounting for undefs, + // convert the 64-bit shuffle mask selection values into 128-bit + // selection bits by dividing the indexes by 2 and shifting into positions + // defined by a vperm2*128 instruction's immediate control byte. + + // The immediate permute control byte looks like this: + // [1:0] - select 128 bits from sources for low half of destination + // [2] - ignore + // [3] - zero low half of destination + // [5:4] - select 128 bits from sources for high half of destination + // [6] - ignore + // [7] - zero high half of destination + int MaskLO = Mask[0]; if (MaskLO == SM_SentinelUndef) MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1]; @@ -9091,6 +9106,27 @@ MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3]; unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4; + + // If either input is a zero vector, replace it with an undef input. + // Shuffle mask values < 4 are selecting elements of V1. + // Shuffle mask values >= 4 are selecting elements of V2. + // Adjust each half of the permute mask by clearing the half that was + // selecting the zero vector and setting the zero mask bit. 
+ if (IsV1Zero) { + V1 = DAG.getUNDEF(VT); + if (MaskLO < 4) + PermMask = (PermMask & 0xf0) | 0x08; + if (MaskHI < 4) + PermMask = (PermMask & 0x0f) | 0x80; + } + if (IsV2Zero) { + V2 = DAG.getUNDEF(VT); + if (MaskLO >= 4) + PermMask = (PermMask & 0xf0) | 0x08; + if (MaskHI >= 4) + PermMask = (PermMask & 0x0f) | 0x80; + } + return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, DAG.getConstant(PermMask, MVT::i8)); } Index: llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll +++ llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll @@ -261,3 +261,94 @@ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } + +;; Test zero mask generation. +;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984 +;; Prefer xor+vblendpd over vperm2f128 because that has better performance. + +define <4 x double> @vperm2z_0x08(<4 x double> %a) { +; ALL-LABEL: vperm2z_0x08: +; ALL: # BB#0: +; ALL-NEXT: vperm2f128 $40, %ymm0, %ymm0, %ymm0 +; ALL-NEXT: retq + %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32> + ret <4 x double> %s +} + +define <4 x double> @vperm2z_0x18(<4 x double> %a) { +; ALL-LABEL: vperm2z_0x18: +; ALL: # BB#0: +; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; ALL-NEXT: vblendpd $12, %ymm0, %ymm1, %ymm0 +; ALL-NEXT: retq + %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32> + ret <4 x double> %s +} + +define <4 x double> @vperm2z_0x28(<4 x double> %a) { +; ALL-LABEL: vperm2z_0x28: +; ALL: # BB#0: +; ALL-NEXT: vperm2f128 $40, %ymm0, %ymm0, %ymm0 +; ALL-NEXT: retq + %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32> + ret <4 x double> %s +} + +define <4 x double> @vperm2z_0x38(<4 x double> %a) { +; ALL-LABEL: vperm2z_0x38: +; ALL: # BB#0: +; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; ALL-NEXT: vblendpd $12, %ymm0, %ymm1, %ymm0 +; ALL-NEXT: retq + %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32> + ret 
<4 x double> %s +} + +define <4 x double> @vperm2z_0x80(<4 x double> %a) { +; ALL-LABEL: vperm2z_0x80: +; ALL: # BB#0: +; ALL-NEXT: vperm2f128 $128, %ymm0, %ymm0, %ymm0 +; ALL-NEXT: retq + %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32> + ret <4 x double> %s +} + +define <4 x double> @vperm2z_0x81(<4 x double> %a) { +; ALL-LABEL: vperm2z_0x81: +; ALL: # BB#0: +; ALL-NEXT: vperm2f128 $129, %ymm0, %ymm0, %ymm0 +; ALL-NEXT: retq + %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32> + ret <4 x double> %s +} + +define <4 x double> @vperm2z_0x82(<4 x double> %a) { +; ALL-LABEL: vperm2z_0x82: +; ALL: # BB#0: +; ALL-NEXT: vperm2f128 $128, %ymm0, %ymm0, %ymm0 +; ALL-NEXT: retq + %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32> + ret <4 x double> %s +} + +define <4 x double> @vperm2z_0x83(<4 x double> %a) { +; ALL-LABEL: vperm2z_0x83: +; ALL: # BB#0: +; ALL-NEXT: vperm2f128 $129, %ymm0, %ymm0, %ymm0 +; ALL-NEXT: retq + %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32> + ret <4 x double> %s +} + +;; With AVX2 select the integer version of the instruction. Use an add to force the domain selection. + +define <4 x i64> @vperm2z_int_0x83(<4 x i64> %a, <4 x i64> %b) { +; ALL-LABEL: vperm2z_int_0x83: +; ALL: # BB#0: +; AVX1: vperm2f128 $129, %ymm0, %ymm0, %ymm0 +; AVX2: vperm2i128 $129, %ymm0, %ymm0, %ymm0 + %s = shufflevector <4 x i64> , <4 x i64> %a, <4 x i32> + %c = add <4 x i64> %b, %s + ret <4 x i64> %c +} +