Index: llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -3381,13 +3381,17 @@
     }
     break;
 
-  case X86ISD::BLENDV: {
-    // BLENDV selects like a regular VSELECT.
-    SDValue VSelect = CurDAG->getNode(
-        ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
+  case ISD::VSELECT: {
+    // Replace VSELECT with non-mask conditions with BLENDV.
+    if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
+      break;
+
+    assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
+    SDValue Blendv = CurDAG->getNode(
+        X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
         Node->getOperand(1), Node->getOperand(2));
-    ReplaceNode(Node, VSelect.getNode());
-    SelectCode(VSelect.getNode());
+    ReplaceNode(Node, Blendv.getNode());
+    SelectCode(Blendv.getNode());
     // We already called ReplaceUses.
     return;
   }
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h
@@ -204,7 +204,8 @@
     /// Dynamic (non-constant condition) vector blend where only the sign bits
     /// of the condition elements are used. This is used to enforce that the
     /// condition mask is not valid for generic VSELECT optimizations. This
-    /// can also be used to implement the intrinsics.
+    /// is also used to implement the intrinsics.
+    /// Operands are in VSELECT order: MASK, TRUE, FALSE
    BLENDV,
 
     /// Combined add and sub on an FP vector.
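
The X86ISD::BLENDV comment above carries the key semantic point: only the sign bit of
each condition element is consulted. As a rough illustration (not part of this patch;
assumes an SSE4.1 host, <immintrin.h>, and a hypothetical demo function name), a mask
lane whose top bit is set selects the "true" input even when the rest of the lane is zero:

  #include <cstdint>
  #include <immintrin.h>

  // Illustrative only: BLENDV-style selection reads just the sign bit of each
  // mask lane, so 0x80000000 picks the "true" lane exactly like an all-ones lane.
  __m128 blendv_sign_bit_demo(__m128 t, __m128 f) {
    // Lanes 0 and 2 have the sign bit set; lanes 1 and 3 do not.
    __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(INT32_MIN, 0, -1, 1));
    // _mm_blendv_ps(f, t, mask): lanes 0 and 2 come from t, lanes 1 and 3 from f.
    return _mm_blendv_ps(f, t, mask);
  }
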
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -21840,6 +21840,17 @@
                                               Src1, Src2, Src3),
                                   Mask, PassThru, Subtarget, DAG);
     }
+    case BLENDV: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src3 = Op.getOperand(3);
+
+      EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
+      Src3 = DAG.getBitcast(MaskVT, Src3);
+
+      // Reverse the operands to match VSELECT order.
+      return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
+    }
     case VPERM_2OP : {
       SDValue Src1 = Op.getOperand(1);
       SDValue Src2 = Op.getOperand(2);
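
The reversal above lines the intrinsic arguments up with the node's VSELECT-style operand
order: _mm_blendv_pd(a, b, mask) passes the "false" vector first and the mask last, while
X86ISD::BLENDV takes MASK, TRUE, FALSE. A per-lane reference model, purely illustrative
and not code from this patch (the helper name is hypothetical):

  #include <cstdint>
  #include <cstring>

  // One lane of blendvpd: the result is 'b' when the mask lane's sign bit is
  // set, otherwise 'a' -- i.e. node order (mask, /*true*/ b, /*false*/ a).
  double blendv_pd_lane(double a, double b, double mask) {
    std::uint64_t bits;
    std::memcpy(&bits, &mask, sizeof(bits));
    return (bits >> 63) ? b : a;   // only the sign bit of the mask matters
  }
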
Index: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -448,6 +448,12 @@
 def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
 
 def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
+def X86Blendv : SDNode<"X86ISD::BLENDV",
+                       SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>,
+                                            SDTCisSameAs<0, 2>,
+                                            SDTCisSameAs<2, 3>,
+                                            SDTCisSameNumEltsAs<0, 1>,
+                                            SDTCisSameSizeAs<0, 1>]>>;
 
 def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>;
Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td
@@ -6582,16 +6582,16 @@
                                   VR128:$src2, sub_xmm), 0xf)>;
 }
 
-/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
-multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
-                                    RegisterClass RC, X86MemOperand x86memop,
-                                    PatFrag mem_frag, Intrinsic IntId,
-                                    X86FoldableSchedWrite sched> {
+/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operators
+multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                                X86MemOperand x86memop, ValueType VT,
+                                PatFrag mem_frag, SDNode OpNode,
+                                X86FoldableSchedWrite sched> {
   def rr : Ii8Reg, TAPD, VEX_4V, Sched<[sched]>;
@@ -6600,8 +6600,8 @@
                   !strconcat(OpcodeStr,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                   [(set RC:$dst,
-                        (IntId RC:$src1, (mem_frag addr:$src2),
-                         RC:$src3))], SSEPackedInt>, TAPD, VEX_4V,
+                        (OpNode RC:$src3, (mem_frag addr:$src2),
+                         RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
                  Sched<[sched.Folded, sched.ReadAfterFold,
                         // x86memop:$src2
                         ReadDefault, ReadDefault, ReadDefault, ReadDefault,
@@ -6612,68 +6612,47 @@
 let Predicates = [HasAVX] in {
 let ExeDomain = SSEPackedDouble in {
-defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
-                                           load, int_x86_sse41_blendvpd,
-                                           SchedWriteFVarBlend.XMM>;
-defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
-                                           loadv4f64, int_x86_avx_blendv_pd_256,
-                                           SchedWriteFVarBlend.YMM>, VEX_L;
+defm VBLENDVPD  : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
+                                       v2f64, loadv2f64, X86Blendv,
+                                       SchedWriteFVarBlend.XMM>;
+defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
+                                       v4f64, loadv4f64, X86Blendv,
+                                       SchedWriteFVarBlend.YMM>, VEX_L;
 } // ExeDomain = SSEPackedDouble
 let ExeDomain = SSEPackedSingle in {
-defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
-                                           load, int_x86_sse41_blendvps,
-                                           SchedWriteFVarBlend.XMM>;
-defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
-                                           loadv8f32, int_x86_avx_blendv_ps_256,
-                                           SchedWriteFVarBlend.YMM>, VEX_L;
+defm VBLENDVPS  : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
+                                       v4f32, loadv4f32, X86Blendv,
+                                       SchedWriteFVarBlend.XMM>;
+defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
+                                       v8f32, loadv8f32, X86Blendv,
+                                       SchedWriteFVarBlend.YMM>, VEX_L;
 } // ExeDomain = SSEPackedSingle
 
-defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
-                                           load, int_x86_sse41_pblendvb,
-                                           SchedWriteVarBlend.XMM>;
+defm VPBLENDVB  : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
+                                       v16i8, loadv16i8, X86Blendv,
+                                       SchedWriteVarBlend.XMM>;
 }
 
 let Predicates = [HasAVX2] in {
-defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
-                                           load, int_x86_avx2_pblendvb,
-                                           SchedWriteVarBlend.YMM>, VEX_L;
+defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
+                                       v32i8, loadv32i8, X86Blendv,
+                                       SchedWriteVarBlend.YMM>, VEX_L;
 }
 
 let Predicates = [HasAVX] in {
-  def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
-                            (v16i8 VR128:$src2))),
-            (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
-  def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
-                            (v4i32 VR128:$src2))),
+  def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
+                              (v4i32 VR128:$src2))),
             (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
-  def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
-                            (v4f32 VR128:$src2))),
-            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
-  def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
-                            (v2i64 VR128:$src2))),
-            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
-  def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
-                            (v2f64 VR128:$src2))),
+  def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
+                              (v2i64 VR128:$src2))),
             (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
-  def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
-                            (v8i32 VR256:$src2))),
-            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
-  def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
-                            (v8f32 VR256:$src2))),
+  def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
+                              (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
-  def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
-                            (v4i64 VR256:$src2))),
-            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
-  def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
-                            (v4f64 VR256:$src2))),
+  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
+                              (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
 }
 
-let Predicates = [HasAVX2] in {
-  def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
-                            (v32i8 VR256:$src2))),
-            (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
-}
-
 // Prefer a movss or movsd over a blendps when optimizing for size. these were
 // changed to use blends because blends have better throughput on sandybridge
 // and haswell, but movs[s/d] are 1-2 byte shorter instructions.
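
The VPBLENDVB patterns above blend at byte granularity. A small usage sketch, illustrative
only and not part of the patch (assumes SSE4.1, <immintrin.h>, and a hypothetical helper
name):

  #include <immintrin.h>

  // Byte-wise blend: each result byte comes from 'limit' where the matching
  // byte of 'mask' has its top bit set, otherwise from 'x'.
  __m128i clamp_signed_bytes(__m128i x, __m128i limit) {
    __m128i mask = _mm_cmpgt_epi8(x, limit);   // 0xFF where x > limit (signed)
    return _mm_blendv_epi8(x, limit, mask);    // take 'limit' in those lanes
  }
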
@@ -6747,16 +6726,17 @@
 }
 
-/// SS41I_ternary_int - SSE 4.1 ternary operator
+/// SS41I_ternary - SSE 4.1 ternary operator
 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
-  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
-                               X86MemOperand x86memop, Intrinsic IntId,
-                               X86FoldableSchedWrite sched> {
+  multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
+                           PatFrag mem_frag, X86MemOperand x86memop,
+                           SDNode OpNode, X86FoldableSchedWrite sched> {
     def rr0 : SS48I,
+                    [(set VR128:$dst,
+                      (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
                     Sched<[sched]>;
 
     def rm0 : SS48I,
+                      (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
                     Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
 
 let ExeDomain = SSEPackedDouble in
-defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memop, f128mem,
-                                  int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>;
+defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
+                              X86Blendv, SchedWriteFVarBlend.XMM>;
 let ExeDomain = SSEPackedSingle in
-defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memop, f128mem,
-                                  int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>;
-defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memop, i128mem,
-                                  int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>;
+defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
+                              X86Blendv, SchedWriteFVarBlend.XMM>;
+defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
+                              X86Blendv, SchedWriteVarBlend.XMM>;
 
 // Aliases with the implicit xmm0 argument
 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
@@ -6794,20 +6773,11 @@
                 (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
 
 let Predicates = [UseSSE41] in {
-  def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
-                            (v16i8 VR128:$src2))),
-            (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
-  def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
-                            (v4i32 VR128:$src2))),
+  def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
+                              (v4i32 VR128:$src2))),
             (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
-  def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
-                            (v4f32 VR128:$src2))),
-            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
-  def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
-                            (v2i64 VR128:$src2))),
-            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
-  def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
-                            (v2f64 VR128:$src2))),
+  def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
+                              (v2i64 VR128:$src2))),
             (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
 }
Index: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
+++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
@@ -23,7 +23,7 @@
   GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX,
   FPCLASSS,
   INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, INTR_TYPE_3OP_IMM8,
-  CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM,
+  CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV,
   CVTPD2PS, CVTPD2PS_MASK, CVTPD2PS_RND_MASK,
   INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
   INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM,
@@ -340,6 +340,8 @@
   X86_INTRINSIC_DATA(addcarry_64, ADX, X86ISD::ADC, X86ISD::ADD),
   X86_INTRINSIC_DATA(avx_addsub_pd_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
   X86_INTRINSIC_DATA(avx_addsub_ps_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
+  X86_INTRINSIC_DATA(avx_blendv_pd_256, BLENDV, X86ISD::BLENDV, 0),
+  X86_INTRINSIC_DATA(avx_blendv_ps_256, BLENDV, X86ISD::BLENDV, 0),
   X86_INTRINSIC_DATA(avx_cmp_pd_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
   X86_INTRINSIC_DATA(avx_cmp_ps_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
   X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0),
@@ -369,6 +371,7 @@
   X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx2_pblendvb, BLENDV, X86ISD::BLENDV, 0),
   X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
@@ -1156,8 +1159,11 @@
   X86_INTRINSIC_DATA(sse3_hadd_ps, INTR_TYPE_2OP, X86ISD::FHADD, 0),
   X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
   X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+  X86_INTRINSIC_DATA(sse41_blendvpd, BLENDV, X86ISD::BLENDV, 0),
+  X86_INTRINSIC_DATA(sse41_blendvps, BLENDV, X86ISD::BLENDV, 0),
   X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
   X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(sse41_pblendvb, BLENDV, X86ISD::BLENDV, 0),
   X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0),
   X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0),
   X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0),
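
Taken together, both the blendv intrinsics and plain vector selects whose condition is a
vector value (rather than a vXi1 mask) now funnel into the single X86ISD::BLENDV node. A
source-level pattern that typically maps onto (v)blendvps on an SSE4.1/AVX target
(illustrative only; the function name is hypothetical and exact code generation depends on
the compiler and flags):

  #include <immintrin.h>

  // The compare yields an all-ones/all-zeros lane mask and the blend consumes
  // only its sign bits -- the shape the BLENDV node models.
  __m128 select_min(__m128 x, __m128 y) {
    __m128 lt = _mm_cmplt_ps(x, y);    // lane mask: x < y
    return _mm_blendv_ps(y, x, lt);    // pick x where x < y, otherwise y
  }
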