Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -27772,6 +27772,58 @@
   return SDValue();
 }
 
+// If this is a bitcasted op that can be represented as another type, push the
+// bitcast to the inputs. This allows more opportunities for pattern
+// matching masked instructions. This is called when we know that the operation
+// is used as one of the inputs of a vselect.
+static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
+                                      TargetLowering::DAGCombinerInfo &DCI) {
+  // Make sure we have a bitcast.
+  if (OrigOp.getOpcode() != ISD::BITCAST)
+    return false;
+
+  SDValue Op = OrigOp.getOperand(0);
+
+  // If the operation is used by anything other than the bitcast, we shouldn't
+  // do this combine as that would replicate the operation.
+  if (!Op.hasOneUse())
+    return false;
+
+  MVT VT = OrigOp.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  SDLoc DL(Op.getNode());
+
+  switch (Op.getOpcode()) {
+  case X86ISD::PALIGNR:
+    // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
+    if (!VT.is128BitVector())
+      return false;
+    LLVM_FALLTHROUGH;
+  case X86ISD::VALIGN: {
+    if (EltVT != MVT::i32 && EltVT != MVT::i64)
+      return false;
+    uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+    MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
+    unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
+    unsigned EltSize = EltVT.getSizeInBits();
+    // Make sure we can represent the same shift with the new VT.
+    if ((ShiftAmt % EltSize) != 0)
+      return false;
+    Imm = ShiftAmt / EltSize;
+    SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
+    DCI.AddToWorklist(Op0.getNode());
+    SDValue Op1 = DAG.getBitcast(VT, Op.getOperand(1));
+    DCI.AddToWorklist(Op1.getNode());
+    DCI.CombineTo(OrigOp.getNode(),
+                  DAG.getNode(X86ISD::VALIGN, DL, VT, Op0, Op1,
+                              DAG.getConstant(Imm, DL, MVT::i8)));
+    return true;
+  }
+  }
+
+  return false;
+}
+
 /// Do target-specific dag combines on SELECT and VSELECT nodes.
 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI,
@@ -28133,6 +28185,17 @@
     }
   }
 
+  // Look for vselects with LHS/RHS being bitcasted from an operation that
+  // can be executed on another type. Push the bitcast to the inputs of
+  // the operation. This exposes opportunities for using masking instructions.
+  if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalizeOps() &&
+      CondVT.getVectorElementType() == MVT::i1) {
+    if (combineBitcastForMaskedOp(LHS, DAG, DCI))
+      return SDValue(N, 0);
+    if (combineBitcastForMaskedOp(RHS, DAG, DCI))
+      return SDValue(N, 0);
+  }
+
   return SDValue();
 }
 
Index: test/CodeGen/X86/vector-shuffle-512-v16.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -360,9 +360,9 @@
 define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01(<16 x i32> %a, <16 x i32> %passthru, i16 %mask) {
 ; ALL-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
 ; ALL:       # BB#0:
-; ALL-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0]
 ; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; ALL-NEXT:    valignd {{.*#+}} zmm1 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
+; ALL-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1>
   %mask.cast = bitcast i16 %mask to <16 x i1>
@@ -373,9 +373,9 @@
 define <16 x i32> @mask_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
 ; ALL-LABEL: mask_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
 ; ALL:       # BB#0:
-; ALL-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm1[0]
 ; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    vpblendmd %zmm0, %zmm2, %zmm0 {%k1}
+; ALL-NEXT:    valignd {{.*#+}} zmm2 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
+; ALL-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32>
   %mask.cast = bitcast i16 %mask to <16 x i1>
@@ -386,9 +386,8 @@
 define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01(<16 x i32> %a, i16 %mask) {
 ; ALL-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
 ; ALL:       # BB#0:
-; ALL-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0]
 ; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; ALL-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1>
   %mask.cast = bitcast i16 %mask to <16 x i1>
@@ -399,9 +398,8 @@
 define <16 x i32> @maskz_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
 ; ALL-LABEL: maskz_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
 ; ALL:       # BB#0:
-; ALL-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm1[0]
 ; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; ALL-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32>
   %mask.cast = bitcast i16 %mask to <16 x i1>
Index: test/CodeGen/X86/vector-shuffle-masked.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-masked.ll
+++ test/CodeGen/X86/vector-shuffle-masked.ll
@@ -4,9 +4,9 @@
 define <4 x i32> @mask_shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_shuffle_v4i32_1234:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    vpblendmd %xmm0, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT:    valignd {{.*#+}} xmm2 {%k1} = xmm0[1,2,3],xmm1[0]
+; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -18,9 +18,8 @@
 define <4 x i32> @maskz_shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
 ; CHECK-LABEL: maskz_shuffle_v4i32_1234:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    valignd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2,3],xmm1[0]
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -32,9 +31,9 @@
 define <4 x i32> @mask_shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_shuffle_v4i32_2345:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    vpblendmd %xmm0, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT:    valignd {{.*#+}} xmm2 {%k1} = xmm0[2,3],xmm1[0,1]
+; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -46,9 +45,8 @@
 define <4 x i32> @maskz_shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
 ; CHECK-LABEL: maskz_shuffle_v4i32_2345:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    valignd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3],xmm1[0,1]
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -60,9 +58,9 @@
 define <2 x i64> @mask_shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_shuffle_v2i64_12:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    vpblendmq %xmm0, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT:    valignq {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[0]
+; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -74,9 +72,8 @@
 define <2 x i64> @maskz_shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: maskz_shuffle_v2i64_12:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    valignq {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[0]
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -167,9 +164,9 @@
 define <8 x i32> @mask_shuffle_v8i32_23456789(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_shuffle_v8i32_23456789:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    valignq {{.*#+}} ymm0 = ymm0[1,2,3],ymm1[0]
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    valignd {{.*#+}} ymm2 {%k1} = ymm0[2,3,4,5,6,7],ymm1[0,1]
+; CHECK-NEXT:    vmovdqa64 %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -180,9 +177,8 @@
 define <8 x i32> @maskz_shuffle_v8i32_23456789(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
 ; CHECK-LABEL: maskz_shuffle_v8i32_23456789:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    valignq {{.*#+}} ymm0 = ymm0[1,2,3],ymm1[0]
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    valignd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,4,5,6,7],ymm1[0,1]
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
   %mask.cast = bitcast i8 %mask to <8 x i1>
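
Note: for reference, the kind of IR this combine improves is a shuffle whose result feeds a vselect with an i1-vector condition, as in the tests above. A minimal sketch modeled on mask_shuffle_v16i32_02_..._00_01 (the function name here is illustrative, and the trailing select/ret lines are assumed, since the diff context above stops at %mask.cast):

  ; Masked 32-bit element rotate: shuffle selected under a v16i1 mask, with a passthru.
  define <16 x i32> @mask_valign_example(<16 x i32> %a, <16 x i32> %passthru, i16 %mask) {
    %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1>
    %mask.cast = bitcast i16 %mask to <16 x i1>
    %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru
    ret <16 x i32> %res
  }

Shuffle lowering turns %shuffle into a v8i64 X86ISD::VALIGN wrapped in a bitcast, so the masked patterns previously could not fold the select; the new combine pushes the bitcast into the VALIGN operands and rewrites it as a v16i32 VALIGN, which is why the updated CHECK lines show a single masked valignd instead of an unmasked valignq followed by vpblendmd.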