Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -793,7 +793,7 @@
     APInt MsbMask = APInt::getHighBitsSet(BitWidth, 1);
     // If we only care about the highest bit, don't bother shifting right.
-    if (MsbMask == DemandedMask) {
+    if (MsbMask == NewMask) {
       unsigned ShAmt = ExVT.getScalarType().getSizeInBits();
       SDValue InOp = Op.getOperand(0);
Index: lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- lib/Target/X86/X86ISelDAGToDAG.cpp
+++ lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2131,16 +2131,6 @@
   case X86ISD::GlobalBaseReg:
     return getGlobalBaseReg();
 
-  case X86ISD::SHRUNKBLEND: {
-    // SHRUNKBLEND selects like a regular VSELECT.
-    SDValue VSelect = CurDAG->getNode(
-        ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
-        Node->getOperand(1), Node->getOperand(2));
-    ReplaceUses(SDValue(Node, 0), VSelect);
-    SelectCode(VSelect.getNode());
-    // We already called ReplaceUses.
-    return nullptr;
-  }
 
   case ISD::ATOMIC_LOAD_XOR:
   case ISD::ATOMIC_LOAD_AND:
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -190,11 +190,6 @@
       /// BLENDI - Blend where the selector is an immediate.
       BLENDI,
 
-      /// SHRUNKBLEND - Blend where the condition has been shrunk.
-      /// This is used to emphasize that the condition mask is
-      /// no more valid for generic VSELECT optimizations.
-      SHRUNKBLEND,
-
       /// ADDSUB - Combined add and sub on an FP vector.
       ADDSUB,
 
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -19910,7 +19910,6 @@
   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
-  case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
   case X86ISD::HADD:               return "X86ISD::HADD";
   case X86ISD::HSUB:               return "X86ISD::HSUB";
@@ -23525,17 +23524,22 @@
       // build_vector of constants. This will be taken care in a later
       // condition.
       (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
-       VT != MVT::v8i16) &&
-      // Don't optimize vector of constants. Those are handled by
-      // the generic code and all the bits must be properly set for
-      // the generic optimizer.
-      !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
+       VT != MVT::v8i16)) {
     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
 
     // Don't optimize vector selects that map to mask-registers.
     if (BitWidth == 1)
       return SDValue();
 
+    // Check all uses of that condition operand to check whether it will be
+    // consumed by non-BLEND instructions, which may depend on all bits are set
+    // properly.
+    for (SDNode::use_iterator I = Cond->use_begin(),
+                              E = Cond->use_end(); I != E; ++I)
+      if (I->getOpcode() != ISD::VSELECT)
+        // TODO: Add other opcodes eventually lowered into BLEND.
+        return SDValue();
+
     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
 
@@ -23543,45 +23547,13 @@
     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
                                           DCI.isBeforeLegalizeOps());
     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
-        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
-                                 TLO)) {
-      // If we changed the computation somewhere in the DAG, this change
-      // will affect all users of Cond.
-      // Make sure it is fine and update all the nodes so that we do not
-      // use the generic VSELECT anymore. Otherwise, we may perform
-      // wrong optimizations as we messed up with the actual expectation
-      // for the vector boolean values.
-      if (Cond != TLO.Old) {
-        // Check all uses of that condition operand to check whether it will be
-        // consumed by non-BLEND instructions, which may depend on all bits are
-        // set properly.
-        for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
-             I != E; ++I)
-          if (I->getOpcode() != ISD::VSELECT)
-            // TODO: Add other opcodes eventually lowered into BLEND.
-            return SDValue();
-
-        // Update all the users of the condition, before committing the change,
-        // so that the VSELECT optimizations that expect the correct vector
-        // boolean value will not be triggered.
-        for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
-             I != E; ++I)
-          DAG.ReplaceAllUsesOfValueWith(
-              SDValue(*I, 0),
-              DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
-                          Cond, I->getOperand(1), I->getOperand(2)));
-        DCI.CommitTargetLoweringOpt(TLO);
-        return SDValue();
-      }
-      // At this point, only Cond is changed. Change the condition
-      // just for N to keep the opportunity to optimize all other
-      // users their own way.
-      DAG.ReplaceAllUsesOfValueWith(
-          SDValue(N, 0),
-          DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
-                      TLO.New, N->getOperand(1), N->getOperand(2)));
-      return SDValue();
-    }
+        (TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
+                                  TLO) &&
+         // Don't optimize vector of constants. Those are handled by
+         // the generic code and all the bits must be properly set for
+         // the generic optimizer.
+         !ISD::isBuildVectorOfConstantSDNodes(TLO.New.getNode())))
+      DCI.CommitTargetLoweringOpt(TLO);
   }
 
   // We should generate an X86ISD::BLENDI from a vselect if its argument
@@ -23595,9 +23567,7 @@
   // Iff we find this pattern and the build_vectors are built from
   // constants, we translate the vselect into a shuffle_vector that we
   // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
-  if ((N->getOpcode() == ISD::VSELECT ||
-       N->getOpcode() == X86ISD::SHRUNKBLEND) &&
-      !DCI.isBeforeLegalize()) {
+  if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
     SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
     if (Shuffle.getNode())
       return Shuffle;
@@ -25785,9 +25755,7 @@
   case ISD::EXTRACT_VECTOR_ELT:
     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
   case ISD::VSELECT:
-  case ISD::SELECT:
-  case X86ISD::SHRUNKBLEND:
-    return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
Index: test/CodeGen/X86/vector-blend.ll
===================================================================
--- test/CodeGen/X86/vector-blend.ll
+++ test/CodeGen/X86/vector-blend.ll
@@ -419,8 +419,8 @@
 ;
 ; SSE41-LABEL: vsel_i648:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm4[1]
+; SSE41-NEXT:    blendpd {{.*#+}} xmm2 = xmm2[0],xmm6[1]
 ; SSE41-NEXT:    movaps %xmm5, %xmm1
 ; SSE41-NEXT:    movaps %xmm7, %xmm3
 ; SSE41-NEXT:    retq
Index: test/CodeGen/X86/vselect-avx.ll
===================================================================
--- test/CodeGen/X86/vselect-avx.ll
+++ test/CodeGen/X86/vselect-avx.ll
@@ -59,19 +59,15 @@
 ;
 ; <rdar://problem/18819506>
 
-; Note: For now, hard code ORIG_MASK and SHRUNK_MASK registers, because we
-; cannot express that ORIG_MASK must not be equal to ORIG_MASK. Otherwise,
-; even a faulty pattern would pass!
-;
 ; CHECK-LABEL: test3:
-; Compute the original mask.
-; CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[ORIG_MASK:%xmm0]]
-; Shrink the bit of the mask.
-; CHECK-NEXT: vpslld $31, [[ORIG_MASK]], [[SHRUNK_MASK:%xmm3]]
-; Use the shrunk mask in the blend.
-; CHECK-NEXT: vblendvps [[SHRUNK_MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
-; Use the original mask in the and.
-; CHECK-NEXT: vpand LCPI2_2(%rip), [[ORIG_MASK]], {{%xmm[0-9]+}}
+; Compute the mask.
+; CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[MASK:%xmm[0-9]+]]
+; Do not shrink the bit of the mask.
+; CHECK-NOT: vpslld $31, [[MASK]], {{%xmm[0-9]+}}
+; Use the mask in the blend.
+; CHECK-NEXT: vblendvps [[MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+; Use the mask in the and.
+; CHECK-NEXT: vpand LCPI2_2(%rip), [[MASK]], {{%xmm[0-9]+}}
 ; CHECK: retq
 define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) {
   %tmp6 = srem <4 x i32> %induction30, <i32 3, i32 3, i32 3, i32 3>