diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -38821,11 +38821,21 @@ // Attempt to extract a i1 element by using MOVMSK to extract the signbits // and then testing the relevant element. + // + // Note that we only combine extracts on the *same* result number, i.e. + // t0 = merge_values a0, a1, a2, a3 + // i1 = extract_vector_elt t0, Constant:i64<2> + // i1 = extract_vector_elt t0, Constant:i64<3> + // but not + // i1 = extract_vector_elt t0:1, Constant:i64<2> + // since the latter would need its own MOVMSK. if (CIdx && SrcVT.getScalarType() == MVT::i1) { SmallVector BoolExtracts; - auto IsBoolExtract = [&BoolExtracts](SDNode *Use) { + unsigned ResNo = InputVector.getResNo(); + auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) { if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT && isa(Use->getOperand(1)) && + Use->getOperand(0).getResNo() == ResNo && Use->getValueType(0) == MVT::i1) { BoolExtracts.push_back(Use); return true; @@ -39666,7 +39676,7 @@ if (N->getOpcode() == ISD::SELECT && VT.isVector() && VT.getVectorElementType() == MVT::i1 && (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) { - MVT IntVT = MVT::getIntegerVT(VT.getVectorNumElements()); + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements()); bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()); bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()); diff --git a/llvm/test/CodeGen/X86/pr45995-2.ll b/llvm/test/CodeGen/X86/pr45995-2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr45995-2.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O3 --x86-asm-syntax=intel -mtriple=x86_64-grtev4-linux-gnu -march=x86-64 -mcpu=skylake-avx512 -mattr=fma,avx512f < %s | FileCheck %s + +define <4 x i1> @selecter(i64 %0) { +; CHECK-LABEL: selecter: +; CHECK: # %bb.0: +; CHECK-NEXT: xor eax, eax +; CHECK-NEXT: cmp rdi, 1 +; CHECK-NEXT: setg al +; CHECK-NEXT: lea eax, [rax + 2*rax] +; CHECK-NEXT: kmovd k0, eax +; CHECK-NEXT: vpmovm2d xmm0, k0 +; CHECK-NEXT: ret + %2 = icmp slt i64 0, %0 + %3 = select i1 %2, <4 x i1> , <4 x i1> zeroinitializer + %4 = insertvalue [4 x <4 x i1>] zeroinitializer, <4 x i1> %3, 0 + %5 = icmp slt i64 1, %0 + %6 = select i1 %5, <4 x i1> , <4 x i1> zeroinitializer + %7 = insertvalue [4 x <4 x i1>] %4, <4 x i1> %6, 1 + %8 = icmp slt i64 2, %0 + %9 = select i1 %8, <4 x i1> , <4 x i1> zeroinitializer + %10 = insertvalue [4 x <4 x i1>] %7, <4 x i1> %9, 2 + %11 = icmp slt i64 3, %0 + %12 = select i1 %11, <4 x i1> , <4 x i1> zeroinitializer + %13 = insertvalue [4 x <4 x i1>] %10, <4 x i1> %12, 3 + %14 = extractvalue [4 x <4 x i1>] %13, 1 + ret <4 x i1> %14 +} diff --git a/llvm/test/CodeGen/X86/pr45995.ll b/llvm/test/CodeGen/X86/pr45995.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr45995.ll @@ -0,0 +1,147 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O3 --x86-asm-syntax=intel -mtriple=x86_64-grtev4-linux-gnu -march=x86-64 -mattr=avx < %s | FileCheck %s + +define void @extracter0([4 x <4 x i1>] %matrix) { +; CHECK-LABEL: extracter0: +; CHECK: # %bb.0: +; CHECK-NEXT: push rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: push r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: push rbx +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset rbx, -32 +; CHECK-NEXT: .cfi_offset r14, -24 +; CHECK-NEXT: .cfi_offset rbp, -16 +; CHECK-NEXT: vpslld xmm0, xmm0, 31 +; CHECK-NEXT: vmovmskps edi, xmm0 +; CHECK-NEXT: mov ebp, edi +; CHECK-NEXT: shr bpl, 3 +; CHECK-NEXT: mov r14d, edi +; CHECK-NEXT: and r14b, 4 +; CHECK-NEXT: shr r14b, 2 +; CHECK-NEXT: mov ebx, edi +; CHECK-NEXT: and bl, 2 +; CHECK-NEXT: shr bl +; CHECK-NEXT: call print_i1 +; CHECK-NEXT: movzx edi, bl +; CHECK-NEXT: call print_i1 +; CHECK-NEXT: movzx edi, r14b +; CHECK-NEXT: call print_i1 +; CHECK-NEXT: movzx edi, bpl +; CHECK-NEXT: call print_i1 +; CHECK-NEXT: pop rbx +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pop r14 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pop rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: ret + %1 = extractvalue [4 x <4 x i1>] %matrix, 0 + %2 = extractelement <4 x i1> %1, i64 0 + %3 = extractelement <4 x i1> %1, i64 1 + %4 = extractelement <4 x i1> %1, i64 2 + %5 = extractelement <4 x i1> %1, i64 3 + call void @print_i1(i1 %2) + call void @print_i1(i1 %3) + call void @print_i1(i1 %4) + call void @print_i1(i1 %5) + ret void +} + +define void @extracter1([4 x <4 x i1>] %matrix) { +; CHECK-LABEL: extracter1: +; CHECK: # %bb.0: +; CHECK-NEXT: push rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: push r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: push r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: push r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: push r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: push rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: push rax +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: .cfi_offset rbx, -56 +; CHECK-NEXT: .cfi_offset r12, -48 +; CHECK-NEXT: .cfi_offset r13, -40 +; CHECK-NEXT: .cfi_offset r14, -32 +; CHECK-NEXT: .cfi_offset r15, -24 +; CHECK-NEXT: .cfi_offset rbp, -16 +; CHECK-NEXT: vpslld xmm1, xmm1, 31 +; CHECK-NEXT: vmovmskps ebp, xmm1 +; CHECK-NEXT: mov eax, ebp +; CHECK-NEXT: shr al, 3 +; CHECK-NEXT: mov byte ptr [rsp + 7], al # 1-byte Spill +; CHECK-NEXT: mov r15d, ebp +; CHECK-NEXT: and r15b, 4 +; CHECK-NEXT: shr r15b, 2 +; CHECK-NEXT: mov r13d, ebp +; CHECK-NEXT: and r13b, 2 +; CHECK-NEXT: shr r13b +; CHECK-NEXT: vpslld xmm0, xmm0, 31 +; CHECK-NEXT: vmovmskps edi, xmm0 +; CHECK-NEXT: mov r12d, edi +; CHECK-NEXT: shr r12b, 3 +; CHECK-NEXT: mov ebx, edi +; CHECK-NEXT: and bl, 4 +; CHECK-NEXT: shr bl, 2 +; CHECK-NEXT: mov r14d, edi +; CHECK-NEXT: and r14b, 2 +; CHECK-NEXT: shr r14b +; CHECK-NEXT: call print_i1 +; CHECK-NEXT: movzx edi, r14b +; CHECK-NEXT: call print_i1 +; CHECK-NEXT: movzx edi, bl +; CHECK-NEXT: call print_i1 +; CHECK-NEXT: movzx edi, r12b +; CHECK-NEXT: call print_i1 +; CHECK-NEXT: mov edi, ebp +; CHECK-NEXT: call print_i1 +; CHECK-NEXT: movzx edi, r13b +; CHECK-NEXT: call print_i1 +; CHECK-NEXT: movzx edi, r15b +; CHECK-NEXT: call print_i1 +; CHECK-NEXT: movzx edi, byte ptr [rsp + 7] # 1-byte Folded Reload +; CHECK-NEXT: call print_i1 +; CHECK-NEXT: add rsp, 8 +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: pop rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pop r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pop r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pop r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pop r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pop rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: ret + %1 = extractvalue [4 x <4 x i1>] %matrix, 0 + %2 = extractelement <4 x i1> %1, i64 0 + %3 = extractelement <4 x i1> %1, i64 1 + %4 = extractelement <4 x i1> %1, i64 2 + %5 = extractelement <4 x i1> %1, i64 3 + call void @print_i1(i1 %2) + call void @print_i1(i1 %3) + call void @print_i1(i1 %4) + call void @print_i1(i1 %5) + %6 = extractvalue [4 x <4 x i1>] %matrix, 1 + %7 = extractelement <4 x i1> %6, i64 0 + %8 = extractelement <4 x i1> %6, i64 1 + %9 = extractelement <4 x i1> %6, i64 2 + %10 = extractelement <4 x i1> %6, i64 3 + call void @print_i1(i1 %7) + call void @print_i1(i1 %8) + call void @print_i1(i1 %9) + call void @print_i1(i1 %10) + ret void +} + +declare void @print_i1(i1)