Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12237,10 +12237,14 @@
   else if (Opcode == ISD::ZERO_EXTEND)
     ExtLoadOpcode = ISD::ZEXTLOAD;
 
+  // An illegal VSELECT may make ISel fail if it is created after legalization
+  // (DAG Combine2), so conservatively check the OperationAction.
   LoadSDNode *Load1 = cast<LoadSDNode>(Op1);
   LoadSDNode *Load2 = cast<LoadSDNode>(Op2);
   if (!TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load1->getMemoryVT()) ||
-      !TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load2->getMemoryVT()))
+      !TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load2->getMemoryVT()) ||
+      (N0->getOpcode() == ISD::VSELECT &&
+       TLI.getOperationAction(ISD::VSELECT, VT) != TargetLowering::Legal))
     return SDValue();
 
   SDValue Ext1 = DAG.getNode(Opcode, DL, VT, Op1);
Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1039,6 +1039,8 @@
         break;
 
       assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
+      assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
+             "We can't replace VSELECT with BLENDV in vXi16!");
       SDValue Blendv =
          CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
                          N->getOperand(0), N->getOperand(1), N->getOperand(2));
Index: llvm/test/CodeGen/X86/select-ext.ll
===================================================================
--- llvm/test/CodeGen/X86/select-ext.ll
+++ llvm/test/CodeGen/X86/select-ext.ll
@@ -94,11 +94,12 @@
 define <2 x i64> @zext_vector_v2i1(ptr %p, <2 x i1> %c) {
 ; CHECK-LABEL: zext_vector_v2i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    psllq $63, %xmm0
-; CHECK-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; CHECK-NEXT:    pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
-; CHECK-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
-; CHECK-NEXT:    movapd %xmm1, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT:    pslld $31, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT:    blendvps %xmm0, %xmm2, %xmm1
+; CHECK-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
 ; CHECK-NEXT:    retq
   %ld1 = load volatile <2 x i32>, ptr %p
   %arrayidx1 = getelementptr inbounds <2 x i32>, ptr %p, i64 1
@@ -131,11 +132,12 @@
 define <2 x i64> @sext_vector_v2i1(ptr %p, <2 x i1> %c) {
 ; CHECK-LABEL: sext_vector_v2i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    psllq $63, %xmm0
-; CHECK-NEXT:    pmovsxdq (%rdi), %xmm1
-; CHECK-NEXT:    pmovsxdq 8(%rdi), %xmm2
-; CHECK-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
-; CHECK-NEXT:    movapd %xmm1, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT:    pslld $31, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT:    blendvps %xmm0, %xmm2, %xmm1
+; CHECK-NEXT:    pmovsxdq %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %ld1 = load volatile <2 x i32>, ptr %p
   %arrayidx1 = getelementptr inbounds <2 x i32>, ptr %p, i64 1
Index: llvm/test/CodeGen/X86/vselect-post-combine.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/vselect-post-combine.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+
+define ptr @test_mul(<32 x i8> %vecinit.i.i.i.i.i92) {
+; AVX2-LABEL: test_mul:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,0,0,0]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT:    vpblendvb %xmm0, (%rcx), %xmm1, %xmm0
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, 0
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+entry:
+  %vecinit31.i.i.i.i.i123 = shufflevector <32 x i8> %vecinit.i.i.i.i.i92, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %0 = bitcast <32 x i8> %vecinit31.i.i.i.i.i123 to <4 x i64>
+  %shuffle.i.i.i6.i.i = shufflevector <4 x i64> %0, <4 x i64> zeroinitializer, <2 x i32> <i32 0, i32 1>
+  %1 = bitcast <2 x i64> %shuffle.i.i.i6.i.i to <16 x i8>
+  %conv.i.i7.i.i = zext <16 x i8> %1 to <16 x i16>
+  %2 = tail call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %conv.i.i7.i.i, <16 x i16> zeroinitializer)
+  %3 = bitcast <32 x i8> %2 to <4 x i64>
+  store <4 x i64> %3, ptr null, align 1
+  ret ptr null
+}
+
+declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>)
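
Note on the DAGCombiner guard: the fold rewrites (ext (vselect Cond, (load p1), (load p2))) into (vselect Cond, (extload p1), (extload p2)), so the VSELECT it creates has the *extended* value type VT. If that happens in the post-legalization combine (DAG Combine2) and VSELECT is not Legal for VT, nothing downstream can legalize the new node; X86's pre-ISel DAG preprocessing (the second hunk) would then try to replace it with X86ISD::BLENDV, which has no vXi16 form since there is no pblendvw instruction, hence the new assert. A reduced IR sketch of the input shape, with hypothetical function and value names, not part of this patch:

; Sketch only: zext of a select of two loads. The guarded combine would
; turn this into (vselect %c, (zextload %p1), (zextload %p2)), i.e. a
; VSELECT with the widened type v16i16.
define <16 x i16> @ext_select_of_loads(<16 x i1> %c, ptr %p1, ptr %p2) {
  %ld1 = load <16 x i8>, ptr %p1
  %ld2 = load <16 x i8>, ptr %p2
  %sel = select <16 x i1> %c, <16 x i8> %ld1, <16 x i8> %ld2
  %ext = zext <16 x i8> %sel to <16 x i16>
  ret <16 x i16> %ext
}

Checking getOperationAction(ISD::VSELECT, VT) != Legal is deliberately conservative: it also disables the fold before legalization for types whose VSELECT would only be Custom-lowered, trading a possible missed combine for never creating a node that instruction selection cannot handle.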