Index: llvm/trunk/lib/Target/X86/X86DomainReassignment.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86DomainReassignment.cpp +++ llvm/trunk/lib/Target/X86/X86DomainReassignment.cpp @@ -663,8 +663,10 @@ createReplacer(X86::XOR32rr, X86::KXORDrr); createReplacer(X86::XOR64rr, X86::KXORQrr); - createReplacer(X86::TEST32rr, X86::KTESTDrr); - createReplacer(X86::TEST64rr, X86::KTESTQrr); + // TODO: KTEST is not a replacement for TEST due to flag differences. Need + // to prove only Z flag is used. + //createReplacer(X86::TEST32rr, X86::KTESTDrr); + //createReplacer(X86::TEST64rr, X86::KTESTQrr); } if (STI->hasDQI()) { @@ -684,8 +686,10 @@ createReplacer(X86::SHR8ri, X86::KSHIFTRBri); createReplacer(X86::SHL8ri, X86::KSHIFTLBri); - createReplacer(X86::TEST8rr, X86::KTESTBrr); - createReplacer(X86::TEST16rr, X86::KTESTWrr); + // TODO: KTEST is not a replacement for TEST due to flag differences. Need + // to prove only Z flag is used. + //createReplacer(X86::TEST8rr, X86::KTESTBrr); + //createReplacer(X86::TEST16rr, X86::KTESTWrr); createReplacer(X86::XOR8rr, X86::KXORBrr); } Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -17114,24 +17114,6 @@ return false; } -// Emit KTEST instruction for bit vectors on AVX-512 -static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - if (Op.getOpcode() == ISD::BITCAST) { - auto hasKTEST = [&](MVT VT) { - unsigned SizeInBits = VT.getSizeInBits(); - return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) || - (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64)); - }; - SDValue Op0 = Op.getOperand(0); - MVT Op0VT = Op0.getValueType().getSimpleVT(); - if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 && - hasKTEST(Op0VT)) - return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0); - } - return SDValue(); -} - /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, @@ -17171,9 +17153,6 @@ // doing a separate TEST. TEST always sets OF and CF to 0, so unless // we prove that the arithmetic won't overflow, we can't use OF or CF. if (Op.getResNo() != 0 || NeedOF || NeedCF) { - // Emit KTEST for bit vectors - if (auto Node = EmitKTEST(Op, DAG, Subtarget)) - return Node; // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); @@ -17396,10 +17375,6 @@ } if (Opcode == 0) { - // Emit KTEST for bit vectors - if (auto Node = EmitKTEST(Op, DAG, Subtarget)) - return Node; - // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); @@ -18146,6 +18121,34 @@ return Result; } +// Try to select this as a KTEST+SETCC if possible. +static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC, + const SDLoc &dl, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // Only support equality comparisons. + if (CC != ISD::SETEQ && CC != ISD::SETNE) + return SDValue(); + + // Must be a bitcast from vXi1. + if (Op0.getOpcode() != ISD::BITCAST) + return SDValue(); + + Op0 = Op0.getOperand(0); + MVT VT = Op0.getSimpleValueType(); + if (!(Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1)) && + !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))) + return SDValue(); + + X86::CondCode X86CC; + if (isNullConstant(Op1)) { + X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE; + } else + return SDValue(); + + SDValue KTEST = DAG.getNode(X86ISD::KTEST, dl, MVT::i32, Op0, Op0); + return getSETCC(X86CC, KTEST, dl, DAG); +} + SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); @@ -18176,6 +18179,10 @@ return NewSetCC; } + // Try to lower using KTEST. + if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget)) + return NewSetCC; + // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. if ((isOneConstant(Op1) || isNullConstant(Op1)) && Index: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll +++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll @@ -2694,3 +2694,95 @@ %ret = bitcast <8 x i1> %m2 to i8 ret i8 %ret } + +; Make sure we don't emit a ktest for signed comparisons. +define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) { +; KNL-LABEL: ktest_signed: +; KNL: ## %bb.0: +; KNL-NEXT: pushq %rax +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testw %ax, %ax +; KNL-NEXT: jle LBB64_1 +; KNL-NEXT: ## %bb.2: ## %bb.2 +; KNL-NEXT: popq %rax +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; KNL-NEXT: LBB64_1: ## %bb.1 +; KNL-NEXT: vzeroupper +; KNL-NEXT: callq _foo +; KNL-NEXT: popq %rax +; KNL-NEXT: retq +; +; SKX-LABEL: ktest_signed: +; SKX: ## %bb.0: +; SKX-NEXT: pushq %rax +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: testw %ax, %ax +; SKX-NEXT: jle LBB64_1 +; SKX-NEXT: ## %bb.2: ## %bb.2 +; SKX-NEXT: popq %rax +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq +; SKX-NEXT: LBB64_1: ## %bb.1 +; SKX-NEXT: vzeroupper +; SKX-NEXT: callq _foo +; SKX-NEXT: popq %rax +; SKX-NEXT: retq +; +; AVX512BW-LABEL: ktest_signed: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: pushq %rax +; AVX512BW-NEXT: .cfi_def_cfa_offset 16 +; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testw %ax, %ax +; AVX512BW-NEXT: jle LBB64_1 +; AVX512BW-NEXT: ## %bb.2: ## %bb.2 +; AVX512BW-NEXT: popq %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; AVX512BW-NEXT: LBB64_1: ## %bb.1 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: callq _foo +; AVX512BW-NEXT: popq %rax +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: ktest_signed: +; AVX512DQ: ## %bb.0: +; AVX512DQ-NEXT: pushq %rax +; AVX512DQ-NEXT: .cfi_def_cfa_offset 16 +; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: testw %ax, %ax +; AVX512DQ-NEXT: jle LBB64_1 +; AVX512DQ-NEXT: ## %bb.2: ## %bb.2 +; AVX512DQ-NEXT: popq %rax +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; AVX512DQ-NEXT: LBB64_1: ## %bb.1 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: callq _foo +; AVX512DQ-NEXT: popq %rax +; AVX512DQ-NEXT: retq + %a = icmp eq <16 x i32> %x, zeroinitializer + %b = icmp eq <16 x i32> %y, zeroinitializer + %c = and <16 x i1> %a, %b + %d = bitcast <16 x i1> %c to i16 + %e = icmp sgt i16 %d, 0 + br i1 %e, label %bb.2, label %bb.1 +bb.1: + call void @foo() + br label %bb.2 +bb.2: + ret void +} +declare void @foo() + Index: llvm/trunk/test/CodeGen/X86/domain-reassignment.mir =================================================================== --- llvm/trunk/test/CodeGen/X86/domain-reassignment.mir +++ llvm/trunk/test/CodeGen/X86/domain-reassignment.mir @@ -256,7 +256,7 @@ body: | ; CHECK-LABEL: name: test_8bitops ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $rdi, $zmm0, $zmm1, $zmm2, $zmm3 ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY $rdi ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY $zmm0 @@ -277,9 +277,6 @@ ; CHECK: [[COPY8:%[0-9]+]]:vk8wm = COPY [[COPY7]] ; CHECK: [[VMOVAPDZrrk:%[0-9]+]]:vr512 = VMOVAPDZrrk [[COPY2]], killed [[COPY8]], [[COPY1]] ; CHECK: VMOVAPDZmr [[COPY]], 1, $noreg, 0, $noreg, killed [[VMOVAPDZrrk]] - ; CHECK: KTESTBrr [[KADDBrr]], [[KADDBrr]], implicit-def $eflags - ; CHECK: JE_1 %bb.1, implicit $eflags - ; CHECK: JMP_1 %bb.2 ; CHECK: bb.1: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: bb.2: @@ -311,9 +308,10 @@ %11 = VMOVAPDZrrk %2, killed %10, %1 VMOVAPDZmr %0, 1, $noreg, 0, $noreg, killed %11 - TEST8rr %18, %18, implicit-def $eflags - JE_1 %bb.1, implicit $eflags - JMP_1 %bb.2 + ; FIXME We can't replace TEST with KTEST due to flag differences + ; TEST8rr %18, %18, implicit-def $eflags + ; JE_1 %bb.1, implicit $eflags + ; JMP_1 %bb.2 bb.1: @@ -377,7 +375,7 @@ body: | ; CHECK-LABEL: name: test_16bitops ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $rdi, $zmm0, $zmm1, $zmm2, $zmm3 ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY $rdi ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY $zmm0 @@ -397,9 +395,6 @@ ; CHECK: [[COPY8:%[0-9]+]]:vk16wm = COPY [[COPY7]] ; CHECK: [[VMOVAPSZrrk:%[0-9]+]]:vr512 = VMOVAPSZrrk [[COPY2]], killed [[COPY8]], [[COPY1]] ; CHECK: VMOVAPSZmr [[COPY]], 1, $noreg, 0, $noreg, killed [[VMOVAPSZrrk]] - ; CHECK: KTESTWrr [[KXORWrr]], [[KXORWrr]], implicit-def $eflags - ; CHECK: JE_1 %bb.1, implicit $eflags - ; CHECK: JMP_1 %bb.2 ; CHECK: bb.1: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: bb.2: @@ -430,9 +425,10 @@ %11 = VMOVAPSZrrk %2, killed %10, %1 VMOVAPSZmr %0, 1, $noreg, 0, $noreg, killed %11 - TEST16rr %17, %17, implicit-def $eflags - JE_1 %bb.1, implicit $eflags - JMP_1 %bb.2 + ; FIXME We can't replace TEST with KTEST due to flag differences + ; FIXME TEST16rr %17, %17, implicit-def $eflags + ; FIXME JE_1 %bb.1, implicit $eflags + ; FIXME JMP_1 %bb.2 bb.1: @@ -490,7 +486,7 @@ body: | ; CHECK-LABEL: name: test_32bitops ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $rdi, $zmm0, $zmm1 ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY $rdi ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY $zmm0 @@ -507,9 +503,6 @@ ; CHECK: [[COPY3:%[0-9]+]]:vk32wm = COPY [[KADDDrr]] ; CHECK: [[VMOVDQU16Zrrk:%[0-9]+]]:vr512 = VMOVDQU16Zrrk [[COPY2]], killed [[COPY3]], [[COPY1]] ; CHECK: VMOVDQA32Zmr [[COPY]], 1, $noreg, 0, $noreg, killed [[VMOVDQU16Zrrk]] - ; CHECK: KTESTDrr [[KADDDrr]], [[KADDDrr]], implicit-def $eflags - ; CHECK: JE_1 %bb.1, implicit $eflags - ; CHECK: JMP_1 %bb.2 ; CHECK: bb.1: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: bb.2: @@ -535,9 +528,10 @@ %4 = VMOVDQU16Zrrk %2, killed %3, %1 VMOVDQA32Zmr %0, 1, $noreg, 0, $noreg, killed %4 - TEST32rr %13, %13, implicit-def $eflags - JE_1 %bb.1, implicit $eflags - JMP_1 %bb.2 + ; FIXME We can't replace TEST with KTEST due to flag differences + ; FIXME TEST32rr %13, %13, implicit-def $eflags + ; FIXME JE_1 %bb.1, implicit $eflags + ; FIXME JMP_1 %bb.2 bb.1: @@ -595,7 +589,7 @@ body: | ; CHECK-LABEL: name: test_64bitops ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $rdi, $zmm0, $zmm1 ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY $rdi ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY $zmm0 @@ -612,9 +606,6 @@ ; CHECK: [[COPY3:%[0-9]+]]:vk64wm = COPY [[KADDQrr]] ; CHECK: [[VMOVDQU8Zrrk:%[0-9]+]]:vr512 = VMOVDQU8Zrrk [[COPY2]], killed [[COPY3]], [[COPY1]] ; CHECK: VMOVDQA32Zmr [[COPY]], 1, $noreg, 0, $noreg, killed [[VMOVDQU8Zrrk]] - ; CHECK: KTESTQrr [[KADDQrr]], [[KADDQrr]], implicit-def $eflags - ; CHECK: JE_1 %bb.1, implicit $eflags - ; CHECK: JMP_1 %bb.2 ; CHECK: bb.1: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: bb.2: @@ -640,9 +631,10 @@ %4 = VMOVDQU8Zrrk %2, killed %3, %1 VMOVDQA32Zmr %0, 1, $noreg, 0, $noreg, killed %4 - TEST64rr %13, %13, implicit-def $eflags - JE_1 %bb.1, implicit $eflags - JMP_1 %bb.2 + ; FIXME We can't replace TEST with KTEST due to flag differences + ; FIXME TEST64rr %13, %13, implicit-def $eflags + ; FIXME JE_1 %bb.1, implicit $eflags + ; FIXME JMP_1 %bb.2 bb.1: