Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -35542,6 +35542,43 @@
   return SDValue();
 }
 
+/// If both arms of a vector select are concatenated vectors, split the select,
+/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
+///   vselect Cond, (concat T0, T1), (concat F0, F1) -->
+///   concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
+/// Returns a default-constructed SDValue when \p N is not a 256-bit
+/// VSELECT/BLENDV, or when either arm has multiple uses or is not a concat.
+static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget &Subtarget) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
+    return SDValue();
+
+  // TODO: Split 512-bit vectors too?
+  EVT VT = N->getValueType(0);
+  if (!VT.is256BitVector())
+    return SDValue();
+
+  // TODO: Split as long as any 2 of the 3 operands are concatenated?
+  SDValue Cond = N->getOperand(0);
+  SDValue TVal = N->getOperand(1);
+  SDValue FVal = N->getOperand(2);
+  SmallVector<SDValue, 4> CatOpsT, CatOpsF;
+  if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
+      !collectConcatOps(TVal.getNode(), CatOpsT) ||
+      !collectConcatOps(FVal.getNode(), CatOpsF))
+    return SDValue();
+
+  // Rebuild the node with the same opcode on whatever operands the callback
+  // receives; the result type comes from Ops[1] (the TVal slot passed below).
+  auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
+                            ArrayRef<SDValue> Ops) {
+    return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
+  };
+  return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
+                          makeBlend, /*CheckBWI*/ false);
+}
+
 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
   SDValue Cond = N->getOperand(0);
   SDValue LHS = N->getOperand(1);
@@ -36105,6 +36142,9 @@
   if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
     return V;
 
+  if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
+    return V;
+
   // Custom action for SELECT MMX
   if (VT == MVT::x86mmx) {
     LHS = DAG.getBitcast(MVT::i64, LHS);
Index: llvm/trunk/test/CodeGen/X86/cast-vsel.ll
=================================================================== --- llvm/trunk/test/CodeGen/X86/cast-vsel.ll +++ llvm/trunk/test/CodeGen/X86/cast-vsel.ll @@ -38,15 +38,16 @@ ; AVX1-LABEL: sext: ; AVX1: # %bb.0: ; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpmovsxwd %xmm2, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm5, %xmm5 +; AVX1-NEXT: vblendvps %xmm1, %xmm4, %xmm5, %xmm1 ; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vpmovsxwd %xmm3, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] ; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: sext: @@ -95,12 +96,13 @@ ; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vblendvps %xmm5, %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, 
%ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext: @@ -403,6 +405,8 @@ ret void } +; TODO: AVX1 could have used 256-bit ops for a likely improvement. + define void @example24(i16 signext %x, i16 signext %y) nounwind { ; SSE2-LABEL: example24: ; SSE2: # %bb.0: # %vector.ph @@ -469,26 +473,27 @@ ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vmovd %esi, %xmm1 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; AVX1-NEXT: vmovd %esi, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] ; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000 -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vpmovsxwd %xmm1, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vpmovsxwd %xmm2, %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm3, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3 ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB6_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vmovups da+4096(%rax), %ymm2 -; AVX1-NEXT: vcmpltps db+4096(%rax), %ymm2, %ymm2 -; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm2 -; AVX1-NEXT: vmovups %ymm2, dj+4096(%rax) +; AVX1-NEXT: vmovups da+4096(%rax), %ymm4 +; AVX1-NEXT: vcmpltps db+4096(%rax), %ymm4, %ymm4 +; AVX1-NEXT: vblendvps %xmm4, %xmm0, %xmm1, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-NEXT: vblendvps %xmm4, %xmm2, %xmm3, %xmm4 
+; AVX1-NEXT: vmovaps %xmm4, dj+4112(%rax) +; AVX1-NEXT: vmovaps %xmm5, dj+4096(%rax) ; AVX1-NEXT: addq $32, %rax ; AVX1-NEXT: jne .LBB6_1 ; AVX1-NEXT: # %bb.2: # %for.end Index: llvm/trunk/test/CodeGen/X86/known-signbits-vector.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/known-signbits-vector.ll +++ llvm/trunk/test/CodeGen/X86/known-signbits-vector.ll @@ -310,24 +310,23 @@ ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $16, %esp -; X32-NEXT: vpmovsxdq 16(%ebp), %xmm3 -; X32-NEXT: vpmovsxdq 8(%ebp), %xmm4 -; X32-NEXT: vextractf128 $1, %ymm2, %xmm5 -; X32-NEXT: vpsrlq $33, %xmm5, %xmm5 +; X32-NEXT: vpmovsxdq 8(%ebp), %xmm3 +; X32-NEXT: vpmovsxdq 16(%ebp), %xmm4 +; X32-NEXT: vpsrlq $33, %xmm2, %xmm5 ; X32-NEXT: vmovdqa {{.*#+}} xmm6 = [1073741824,0,1,0] ; X32-NEXT: vpxor %xmm6, %xmm5, %xmm5 ; X32-NEXT: vpsubq %xmm6, %xmm5, %xmm5 +; X32-NEXT: vextractf128 $1, %ymm2, %xmm2 ; X32-NEXT: vpsrlq $33, %xmm2, %xmm2 ; X32-NEXT: vpxor %xmm6, %xmm2, %xmm2 ; X32-NEXT: vpsubq %xmm6, %xmm2, %xmm2 -; X32-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; X32-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; X32-NEXT: vextractf128 $1, %ymm1, %xmm4 -; X32-NEXT: vextractf128 $1, %ymm0, %xmm5 -; X32-NEXT: vpcmpeqq %xmm4, %xmm5, %xmm4 +; X32-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6 +; X32-NEXT: vblendvpd %xmm6, %xmm5, %xmm3, %xmm3 +; X32-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X32-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X32-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; X32-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; X32-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0 +; X32-NEXT: vblendvpd %xmm0, %xmm2, %xmm4, %xmm0 +; X32-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; X32-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; X32-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] @@ -339,25 +338,24 @@ ; ; X64-LABEL: signbits_ashr_sext_select_shuffle_sitofp: ; X64: # %bb.0: -; X64-NEXT: vextractf128 $1, 
%ymm2, %xmm4 -; X64-NEXT: vpsrlq $33, %xmm4, %xmm4 +; X64-NEXT: vpsrlq $33, %xmm2, %xmm4 ; X64-NEXT: vmovdqa {{.*#+}} xmm5 = [1073741824,1] ; X64-NEXT: vpxor %xmm5, %xmm4, %xmm4 ; X64-NEXT: vpsubq %xmm5, %xmm4, %xmm4 +; X64-NEXT: vextractf128 $1, %ymm2, %xmm2 ; X64-NEXT: vpsrlq $33, %xmm2, %xmm2 ; X64-NEXT: vpxor %xmm5, %xmm2, %xmm2 ; X64-NEXT: vpsubq %xmm5, %xmm2, %xmm2 -; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; X64-NEXT: vpmovsxdq %xmm3, %xmm4 +; X64-NEXT: vpmovsxdq %xmm3, %xmm5 ; X64-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] ; X64-NEXT: vpmovsxdq %xmm3, %xmm3 -; X64-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; X64-NEXT: vextractf128 $1, %ymm1, %xmm4 -; X64-NEXT: vextractf128 $1, %ymm0, %xmm5 -; X64-NEXT: vpcmpeqq %xmm4, %xmm5, %xmm4 +; X64-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6 +; X64-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm4 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X64-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; X64-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; X64-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0 +; X64-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 ; X64-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] Index: llvm/trunk/test/CodeGen/X86/vselect-avx.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vselect-avx.ll +++ llvm/trunk/test/CodeGen/X86/vselect-avx.ll @@ -167,7 +167,7 @@ ret <32 x i8> %tmp } -; TODO: Split a 256-bit select into two 128-bit selects when the operands are concatenated. +; Split a 256-bit select into two 128-bit selects when the operands are concatenated. 
define void @blendv_split(<8 x i32>* %p, <8 x i32> %cond, <8 x i32> %a, <8 x i32> %x, <8 x i32> %y, <8 x i32> %z, <8 x i32> %w) { ; AVX1-LABEL: blendv_split: @@ -177,12 +177,13 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpslld %xmm2, %xmm4, %xmm5 ; AVX1-NEXT: vpslld %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 ; AVX1-NEXT: vpslld %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpslld %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rdi) +; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm5, %xmm4, %xmm0 +; AVX1-NEXT: vmovups %xmm0, 16(%rdi) +; AVX1-NEXT: vmovups %xmm1, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ;