diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -42321,10 +42321,19 @@ if ((OpSize == 128 && Subtarget.hasSSE2()) || (OpSize == 256 && Subtarget.hasAVX2()) || (OpSize == 512 && Subtarget.useAVX512Regs())) { - EVT VecVT = OpSize == 512 ? MVT::v16i32 : + auto BW = Subtarget.hasBWI(); + auto VL = Subtarget.hasVLX(); + EVT VecVT = OpSize == 512 && BW ? MVT::v64i8 : + OpSize == 512 ? MVT::v16i32 : OpSize == 256 ? MVT::v32i8 : MVT::v16i8; - EVT CmpVT = OpSize == 512 ? MVT::v16i1 : VecVT; + EVT CmpVT = VecVT; + if (OpSize == 512) { + CmpVT = BW ? MVT::v64i1 : MVT::v16i1; + } else if (BW && VL) { + CmpVT = OpSize == 256 ? MVT::v32i1 : MVT::v16i1; + } + SDValue Cmp; if (IsOrXorXorCCZero) { // This is a bitwise-combined equality comparison of 2 pairs of vectors: @@ -42344,9 +42353,18 @@ Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ); } // For 512-bits we want to emit a setcc that will lower to kortest. + if (OpSize == 512 && BW) + return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i64, Cmp), + DAG.getConstant(0xFFFFFFFFFFFFFFFF, DL, MVT::i64), CC); if (OpSize == 512) return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp), DAG.getConstant(0xFFFF, DL, MVT::i16), CC); + if (OpSize == 256 && BW && VL) + return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i32, Cmp), + DAG.getConstant(0xFFFFFFFF, DL, MVT::i32), CC); + if (OpSize == 128 && BW && VL) + return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp), + DAG.getConstant(0xFFFF, DL, MVT::i16), CC); // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. 
// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll --- a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -6,7 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512F --check-prefix=X64-AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw,avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512BW ; This tests codegen time inlining/optimization of memcmp ; rdar://6480398 @@ -1008,6 +1008,14 @@ ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-AVX-NEXT: setne %al ; X64-AVX-NEXT: retq +; +; X64-AVX512BW-LABEL: length16_eq: +; X64-AVX512BW: # %bb.0: +; X64-AVX512BW-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX512BW-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 +; X64-AVX512BW-NEXT: kortestw %k0, %k0 +; X64-AVX512BW-NEXT: setae %al +; X64-AVX512BW-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind %cmp = icmp ne i32 %call, 0 ret i1 %cmp @@ -1065,6 +1073,14 @@ ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-AVX-NEXT: sete %al ; X64-AVX-NEXT: retq +; +; X64-AVX512BW-LABEL: length16_eq_const: +; X64-AVX512BW: # %bb.0: +; X64-AVX512BW-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX512BW-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %k0 +; X64-AVX512BW-NEXT: kortestw %k0, %k0 +; X64-AVX512BW-NEXT: setb %al +; X64-AVX512BW-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr 
inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind %c = icmp eq i32 %m, 0 ret i1 %c @@ -1326,6 +1342,15 @@ ; X64-AVX2-NEXT: sete %al ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq +; +; X64-AVX512BW-LABEL: length32_eq: +; X64-AVX512BW: # %bb.0: +; X64-AVX512BW-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX512BW-NEXT: vpcmpeqb (%rsi), %ymm0, %k0 +; X64-AVX512BW-NEXT: kortestd %k0, %k0 +; X64-AVX512BW-NEXT: setb %al +; X64-AVX512BW-NEXT: vzeroupper +; X64-AVX512BW-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind %cmp = icmp eq i32 %call, 0 ret i1 %cmp @@ -1473,6 +1498,15 @@ ; X64-AVX2-NEXT: setne %al ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq +; +; X64-AVX512BW-LABEL: length32_eq_const: +; X64-AVX512BW: # %bb.0: +; X64-AVX512BW-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX512BW-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %k0 +; X64-AVX512BW-NEXT: kortestd %k0, %k0 +; X64-AVX512BW-NEXT: setae %al +; X64-AVX512BW-NEXT: vzeroupper +; X64-AVX512BW-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind %c = icmp ne i32 %m, 0 ret i1 %c @@ -1551,6 +1585,15 @@ ; X64-AVX512F-NEXT: setae %al ; X64-AVX512F-NEXT: vzeroupper ; X64-AVX512F-NEXT: retq +; +; X64-AVX512BW-LABEL: length64_eq: +; X64-AVX512BW: # %bb.0: +; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 +; X64-AVX512BW-NEXT: vpcmpeqb (%rsi), %zmm0, %k0 +; X64-AVX512BW-NEXT: kortestq %k0, %k0 +; X64-AVX512BW-NEXT: setae %al +; X64-AVX512BW-NEXT: vzeroupper +; X64-AVX512BW-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind %cmp = icmp ne i32 %call, 0 ret i1 %cmp @@ -1612,6 +1655,15 @@ ; X64-AVX512F-NEXT: setb %al ; X64-AVX512F-NEXT: vzeroupper ; X64-AVX512F-NEXT: retq +; +; X64-AVX512BW-LABEL: length64_eq_const: +; X64-AVX512BW: # %bb.0: +; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 +; X64-AVX512BW-NEXT: vpcmpeqb {{.*}}(%rip), %zmm0, %k0 +; X64-AVX512BW-NEXT: kortestq %k0, %k0 +; 
X64-AVX512BW-NEXT: setb %al +; X64-AVX512BW-NEXT: vzeroupper +; X64-AVX512BW-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind %c = icmp eq i32 %m, 0 ret i1 %c diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll --- a/llvm/test/CodeGen/X86/setcc-wide-types.ll +++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll @@ -319,14 +319,23 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: ne_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: setae %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: ne_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: setae %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: ne_i512: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: kortestq %k0, %k0 +; AVX512BW-NEXT: setae %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %bcx = bitcast <8 x i64> %x to i512 %bcy = bitcast <8 x i64> %y to i512 %cmp = icmp ne i512 %bcx, %bcy @@ -464,14 +473,23 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: eq_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: eq_i512: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: 
kortestq %k0, %k0 +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %bcx = bitcast <8 x i64> %x to i512 %bcy = bitcast <8 x i64> %y to i512 %cmp = icmp eq i512 %bcx, %bcy @@ -804,17 +822,29 @@ ; NO512-NEXT: setne %al ; NO512-NEXT: retq ; -; AVX512-LABEL: ne_i512_pair: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 -; AVX512-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 -; AVX512-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1} -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: setae %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: ne_i512_pair: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 +; AVX512F-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 +; AVX512F-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1} +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: setae %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: ne_i512_pair: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vpcmpeqb (%rsi), %zmm0, %k1 +; AVX512BW-NEXT: vpcmpeqb 64(%rsi), %zmm1, %k0 {%k1} +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: kortestq %k0, %k0 +; AVX512BW-NEXT: setae %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %a0 = load i512, i512* %a %b0 = load i512, i512* %b %xor1 = xor i512 %a0, %b0 @@ -886,17 +916,29 @@ ; NO512-NEXT: sete %al ; NO512-NEXT: retq ; -; AVX512-LABEL: eq_i512_pair: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 -; AVX512-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 -; AVX512-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1} -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: eq_i512_pair: +; AVX512F: # %bb.0: +; AVX512F-NEXT: 
vmovdqu64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 +; AVX512F-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 +; AVX512F-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1} +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: eq_i512_pair: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vpcmpeqb (%rsi), %zmm0, %k1 +; AVX512BW-NEXT: vpcmpeqb 64(%rsi), %zmm1, %k0 {%k1} +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: kortestq %k0, %k0 +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %a0 = load i512, i512* %a %b0 = load i512, i512* %b %xor1 = xor i512 %a0, %b0