Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -368,9 +368,10 @@ // Vector packed fp sign bitwise comparisons. TESTP, - // OR/AND test for masks. - KORTEST, - KTEST, + // Test for masks. Sets ZF if all 0s, CF if all 1s. Takes one argument. + // Isel folds an or/and into KORTEST/KTEST; otherwise KORTEST src, src. + // FIXME: KTEST sets CF from ANDN, so the and-fold is only valid for ZF. + KTESTREG, // Several flavors of instructions with vector shuffle behaviors. // Saturated signed/unnsigned packing. Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -17120,7 +17120,7 @@ MVT Op0VT = Op0.getValueType().getSimpleVT(); if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 && hasKTEST(Op0VT)) - return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0); + return DAG.getNode(X86ISD::KTESTREG, SDLoc(Op), MVT::i32, Op0); } return SDValue(); } @@ -20553,7 +20553,8 @@ (IntNo == Intrinsic::x86_avx512_kortestz_w) ? 
X86::COND_E : X86::COND_B; SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); - SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); + SDValue Test = DAG.getNode(ISD::OR, dl, MVT::v16i1, LHS, RHS); + Test = DAG.getNode(X86ISD::KTESTREG, dl, MVT::i32, Test); SDValue SetCC = getSETCC(X86CC, Test, dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } @@ -25213,8 +25214,7 @@ case X86ISD::MOVMSK: return "X86ISD::MOVMSK"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::TESTP: return "X86ISD::TESTP"; - case X86ISD::KORTEST: return "X86ISD::KORTEST"; - case X86ISD::KTEST: return "X86ISD::KTEST"; + case X86ISD::KTESTREG: return "X86ISD::KTESTREG"; case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL"; case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR"; case X86ISD::PACKSS: return "X86ISD::PACKSS"; Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -2909,8 +2909,8 @@ let Predicates = [prd], Defs = [EFLAGS] in def rr : I, - Sched<[itins.Sched]>; + [(set EFLAGS, (X86ktestreg (OpNode KRC:$src1, KRC:$src2)))], + itins.rr>, Sched<[itins.Sched]>; } multiclass avx512_mask_testop_w opc, string OpcodeStr, SDNode OpNode, @@ -2925,8 +2925,19 @@ VEX, PD, VEX_W; } -defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest, SSE_PTEST>; -defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, SSE_PTEST, HasDQI>; +defm KORTEST : avx512_mask_testop_w<0x98, "kortest", or, SSE_PTEST>; +defm KTEST : avx512_mask_testop_w<0x99, "ktest", and, SSE_PTEST, HasDQI>; + +// Patterns to use a single kortest with the same operand twice if we can't find +// an and/or to combine with. 
+def : Pat<(X86ktestreg VK8:$src), + (KORTESTBrr VK8:$src, VK8:$src)>, Requires<[HasDQI]>; +def : Pat<(X86ktestreg VK16:$src), + (KORTESTWrr VK16:$src, VK16:$src)>, Requires<[HasAVX512]>; +def : Pat<(X86ktestreg VK32:$src), + (KORTESTDrr VK32:$src, VK32:$src)>, Requires<[HasBWI]>; +def : Pat<(X86ktestreg VK64:$src), + (KORTESTQrr VK64:$src, VK64:$src)>, Requires<[HasBWI]>; // Mask shift multiclass avx512_mask_shiftop opc, string OpcodeStr, RegisterClass KRC, Index: lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- lib/Target/X86/X86InstrFragmentsSIMD.td +++ lib/Target/X86/X86InstrFragmentsSIMD.td @@ -242,8 +242,9 @@ def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp, [SDNPCommutative]>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; -def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; -def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>; +def X86ktestreg: SDNode<"X86ISD::KTESTREG", + SDTypeProfile<1, 1, [SDTCisVT<0, i32>, + SDTCVecEltisVT<1, i1>]>>; def X86movmsk : SDNode<"X86ISD::MOVMSK", SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVec<1>]>>; Index: test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- test/CodeGen/X86/avx512-mask-op.ll +++ test/CodeGen/X86/avx512-mask-op.ll @@ -633,8 +633,7 @@ ; SKX-NEXT: vpmovw2m %xmm0, %k0 ; SKX-NEXT: movb $85, %al ; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: ktestb %k0, %k0 +; SKX-NEXT: kortestb %k1, %k0 ; SKX-NEXT: retq ; ; AVX512BW-LABEL: test7: @@ -656,8 +655,7 @@ ; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512DQ-NEXT: movb $85, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: korb %k1, %k0, %k0 -; AVX512DQ-NEXT: ktestb %k0, %k0 +; AVX512DQ-NEXT: kortestb %k1, %k0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq allocas: @@ -1722,8 +1720,8 @@ ; SKX-NEXT: vmovupd (%rdi), %zmm1 ; SKX-NEXT: vcmpltpd 
%zmm0, %zmm1, %k1 ; SKX-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} -; SKX-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} -; SKX-NEXT: ktestb %k0, %k0 +; SKX-NEXT: vcmpltpd %zmm1, %zmm0, %k0 +; SKX-NEXT: ktestb %k0, %k1 ; SKX-NEXT: je LBB42_2 ; SKX-NEXT: ## %bb.1: ## %L1 ; SKX-NEXT: vmovapd %zmm0, (%rdi) @@ -1757,8 +1755,8 @@ ; AVX512DQ-NEXT: vmovupd (%rdi), %zmm1 ; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1 ; AVX512DQ-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} -; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} -; AVX512DQ-NEXT: ktestb %k0, %k0 +; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm0, %k0 +; AVX512DQ-NEXT: ktestb %k0, %k1 ; AVX512DQ-NEXT: je LBB42_2 ; AVX512DQ-NEXT: ## %bb.1: ## %L1 ; AVX512DQ-NEXT: vmovapd %zmm0, (%rdi) @@ -1837,8 +1835,7 @@ ; SKX-NEXT: vcmpltps %zmm3, %zmm0, %k1 ; SKX-NEXT: vcmpltps %zmm2, %zmm1, %k2 ; SKX-NEXT: kunpckwd %k1, %k2, %k1 -; SKX-NEXT: kord %k1, %k0, %k0 -; SKX-NEXT: ktestd %k0, %k0 +; SKX-NEXT: kortestd %k1, %k0 ; SKX-NEXT: je LBB43_2 ; SKX-NEXT: ## %bb.1: ## %L1 ; SKX-NEXT: vmovaps %zmm0, (%rdi) @@ -1863,8 +1860,7 @@ ; AVX512BW-NEXT: vcmpltps %zmm3, %zmm0, %k1 ; AVX512BW-NEXT: vcmpltps %zmm2, %zmm1, %k2 ; AVX512BW-NEXT: kunpckwd %k1, %k2, %k1 -; AVX512BW-NEXT: kord %k1, %k0, %k0 -; AVX512BW-NEXT: ktestd %k0, %k0 +; AVX512BW-NEXT: kortestd %k1, %k0 ; AVX512BW-NEXT: je LBB43_2 ; AVX512BW-NEXT: ## %bb.1: ## %L1 ; AVX512BW-NEXT: vmovaps %zmm0, (%rdi) Index: test/CodeGen/X86/avx512-schedule.ll =================================================================== --- test/CodeGen/X86/avx512-schedule.ll +++ test/CodeGen/X86/avx512-schedule.ll @@ -7034,8 +7034,7 @@ ; GENERIC-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:0.33] ; GENERIC-NEXT: movb $85, %al # sched: [1:0.33] ; GENERIC-NEXT: kmovd %eax, %k1 # sched: [1:0.33] -; GENERIC-NEXT: korb %k1, %k0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: ktestb %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kortestb %k1, %k0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vcmp_test7: @@ -7044,8 +7043,7 @@ ; 
SKX-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00] ; SKX-NEXT: movb $85, %al # sched: [1:0.25] ; SKX-NEXT: kmovd %eax, %k1 # sched: [1:1.00] -; SKX-NEXT: korb %k1, %k0, %k0 # sched: [1:1.00] -; SKX-NEXT: ktestb %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: kortestb %k1, %k0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] allocas: %a= or <8 x i1> %mask, @@ -7618,8 +7616,8 @@ ; GENERIC-NEXT: vmovupd (%rdi), %zmm1 # sched: [4:0.50] ; GENERIC-NEXT: vcmpltpd %zmm0, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} # sched: [4:0.50] -; GENERIC-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} # sched: [3:1.00] -; GENERIC-NEXT: ktestb %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: vcmpltpd %zmm1, %zmm0, %k0 # sched: [3:1.00] +; GENERIC-NEXT: ktestb %k0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: je .LBB410_2 # sched: [1:1.00] ; GENERIC-NEXT: # %bb.1: # %L1 ; GENERIC-NEXT: vmovapd %zmm0, (%rdi) # sched: [1:1.00] @@ -7635,8 +7633,8 @@ ; SKX-NEXT: vmovupd (%rdi), %zmm1 # sched: [8:0.50] ; SKX-NEXT: vcmpltpd %zmm0, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} # sched: [8:0.50] -; SKX-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} # sched: [3:1.00] -; SKX-NEXT: ktestb %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: vcmpltpd %zmm1, %zmm0, %k0 # sched: [3:1.00] +; SKX-NEXT: ktestb %k0, %k1 # sched: [3:1.00] ; SKX-NEXT: je .LBB410_2 # sched: [1:0.50] ; SKX-NEXT: # %bb.1: # %L1 ; SKX-NEXT: vmovapd %zmm0, (%rdi) # sched: [1:1.00] @@ -7687,8 +7685,7 @@ ; GENERIC-NEXT: vcmpltps %zmm3, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vcmpltps %zmm2, %zmm1, %k2 # sched: [3:1.00] ; GENERIC-NEXT: kunpckwd %k1, %k2, %k1 # sched: [1:1.00] -; GENERIC-NEXT: kord %k1, %k0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: ktestd %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kortestd %k1, %k0 # sched: [1:1.00] ; GENERIC-NEXT: je .LBB411_2 # sched: [1:1.00] ; GENERIC-NEXT: # %bb.1: # %L1 ; GENERIC-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] @@ -7707,14 +7704,13 @@ ; SKX-NEXT: vmovups 
64(%rdi), %zmm3 # sched: [8:0.50] ; SKX-NEXT: vcmpltps %zmm0, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vcmpltps %zmm1, %zmm3, %k2 # sched: [3:1.00] +; SKX-NEXT: kunpckwd %k1, %k2, %k0 # sched: [3:1.00] ; SKX-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z} # sched: [8:0.50] ; SKX-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z} # sched: [8:0.50] -; SKX-NEXT: kunpckwd %k1, %k2, %k0 # sched: [3:1.00] ; SKX-NEXT: vcmpltps %zmm3, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vcmpltps %zmm2, %zmm1, %k2 # sched: [3:1.00] ; SKX-NEXT: kunpckwd %k1, %k2, %k1 # sched: [3:1.00] -; SKX-NEXT: kord %k1, %k0, %k0 # sched: [1:1.00] -; SKX-NEXT: ktestd %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: kortestd %k1, %k0 # sched: [3:1.00] ; SKX-NEXT: je .LBB411_2 # sched: [1:0.50] ; SKX-NEXT: # %bb.1: # %L1 ; SKX-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00]