diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -20961,6 +20961,15 @@
     return PTEST;
   }
 
+  // If this is already a PTEST against zero, just pick the condition code.
+  if (Op0.getOpcode() == X86ISD::PTEST && isNullConstant(Op1) &&
+      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    SDLoc DL(Op0);
+    X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE,
+                                  DL, MVT::i8);
+    return Op0;
+  }
+
   // Try to lower using KORTEST.
   if (SDValue KORTEST = EmitKORTEST(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
     return KORTEST;
@@ -42359,18 +42368,27 @@
       !IsOrXorXorCCZero)
     return SDValue();
 
-  // TODO: Use PXOR + PTEST for SSE4.1 or later?
   EVT VT = SetCC->getValueType(0);
   SDLoc DL(SetCC);
 
+  auto AVX = Subtarget.hasAVX();
+
+  // With SSE4.1 and above (but below AVX512), use XOR (plus OR) and PTEST.
+  // Otherwise use PCMPEQ (plus AND) and mask testing.
   if ((OpSize == 128 && Subtarget.hasSSE2()) ||
-      (OpSize == 256 && Subtarget.hasAVX2()) ||
+      (OpSize == 256 && AVX) ||
       (OpSize == 512 && Subtarget.useAVX512Regs())) {
     auto BW = Subtarget.hasBWI();
+    auto PT = Subtarget.hasSSE41();
     EVT VecVT = OpSize == 512 ? (BW ? MVT::v64i8 : MVT::v16i32) :
-                OpSize == 256 ? MVT::v32i8 :
-                                MVT::v16i8;
+                PT ? (OpSize == 256 ? MVT::v8f32 : MVT::v4f32) :
+                OpSize == 256 ? MVT::v32i8 : MVT::v16i8;
     EVT CmpVT = OpSize == 512 ? (BW ? MVT::v64i1 : MVT::v16i1) : VecVT;
+    auto XorOp = OpSize == 256 ? X86::VXORPSYrr :
+                 AVX ? X86::VXORPSrr : X86::XORPSrr;
+    auto OrOp = OpSize == 256 ? X86::VORPSYrr :
+                AVX ? X86::VORPSrr : X86::ORPSrr;
+
     SDValue Cmp;
     if (IsOrXorXorCCZero) {
       // This is a bitwise-combined equality comparison of 2 pairs of vectors:
@@ -42381,13 +42399,23 @@
       SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
       SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
       SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
-      SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
-      SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ);
-      Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2);
+      if (VecVT == CmpVT && PT) {
+        auto Cmp1 = SDValue(DAG.getMachineNode(XorOp, DL, VecVT, A, B), 0);
+        auto Cmp2 = SDValue(DAG.getMachineNode(XorOp, DL, VecVT, C, D), 0);
+        Cmp = SDValue(DAG.getMachineNode(OrOp, DL, VecVT, Cmp1, Cmp2), 0);
+      } else {
+        SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
+        SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ);
+        Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2);
+      }
     } else {
       SDValue VecX = DAG.getBitcast(VecVT, X);
       SDValue VecY = DAG.getBitcast(VecVT, Y);
-      Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
+      if (VecVT == CmpVT && PT) {
+        Cmp = SDValue(DAG.getMachineNode(XorOp, DL, VecVT, VecX, VecY), 0);
+      } else {
+        Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
+      }
     }
     // For 512-bits we want to emit a setcc that will lower to kortest.
     if (OpSize == 512 && BW)
@@ -42396,6 +42424,11 @@
     if (OpSize == 512)
       return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp),
                           DAG.getConstant(0xFFFF, DL, MVT::i16), CC);
+    if (VecVT == CmpVT && PT) {
+      auto BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
+      auto PTest = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
+      return DAG.getSetCC(DL, VT, PTest, DAG.getConstant(0, DL, MVT::i32), CC);
+    }
     // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
     // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
     // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
diff --git a/llvm/test/CodeGen/X86/memcmp-minsize.ll b/llvm/test/CodeGen/X86/memcmp-minsize.ll
--- a/llvm/test/CodeGen/X86/memcmp-minsize.ll
+++ b/llvm/test/CodeGen/X86/memcmp-minsize.ll
@@ -2,7 +2,8 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
 
 ; This tests codegen time inlining/optimization of memcmp
 ; rdar://6480398
@@ -426,14 +427,13 @@
 ; X64-SSE2-NEXT: setne %al
 ; X64-SSE2-NEXT: retq
 ;
-; X64-AVX2-LABEL: length16_eq:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
-; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
-; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX2-NEXT: setne %al
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: length16_eq:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovdqu (%rsi), %xmm0
+; X64-AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; X64-AVX-NEXT: vptest %xmm0, %xmm0
+; X64-AVX-NEXT: setne %al
+; X64-AVX-NEXT: retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
   %cmp = icmp ne i32 %call, 0
   ret i1 %cmp
@@ -471,14 +471,13 @@
 ; X64-SSE2-NEXT: sete %al
 ; X64-SSE2-NEXT: retq
 ;
-; X64-AVX2-LABEL: length16_eq_const:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
-; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX2-NEXT: sete %al
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: length16_eq_const:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovdqa {{.*}}(%rip), %xmm0
+; X64-AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; X64-AVX-NEXT: vptest %xmm0, %xmm0
+; X64-AVX-NEXT: sete %al
+; X64-AVX-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
@@ -609,12 +608,20 @@
 ; X64-SSE2-NEXT: popq %rcx
 ; X64-SSE2-NEXT: retq
 ;
+; X64-AVX1-LABEL: length32_eq:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vmovups (%rsi), %ymm0
+; X64-AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
+; X64-AVX1-NEXT: vptest %ymm0, %ymm0
+; X64-AVX1-NEXT: sete %al
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
 ; X64-AVX2-LABEL: length32_eq:
 ; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
-; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: vmovdqu (%rsi), %ymm0
+; X64-AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vptest %ymm0, %ymm0
 ; X64-AVX2-NEXT: sete %al
 ; X64-AVX2-NEXT: vzeroupper
 ; X64-AVX2-NEXT: retq
@@ -648,12 +655,20 @@
 ; X64-SSE2-NEXT: popq %rcx
 ; X64-SSE2-NEXT: retq
 ;
+; X64-AVX1-LABEL: length32_eq_const:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vmovaps {{.*}}(%rip), %ymm0
+; X64-AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
+; X64-AVX1-NEXT: vptest %ymm0, %ymm0
+; X64-AVX1-NEXT: setne %al
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
 ; X64-AVX2-LABEL: length32_eq_const:
 ; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
-; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: vmovdqa {{.*}}(%rip), %ymm0
+; X64-AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vptest %ymm0, %ymm0
 ; X64-AVX2-NEXT: setne %al
 ; X64-AVX2-NEXT: vzeroupper
 ; X64-AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll
--- a/llvm/test/CodeGen/X86/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll
@@ -2,7 +2,8 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
 
 ; This tests codegen time inlining/optimization of memcmp
 ; rdar://6480398
@@ -560,14 +561,13 @@
 ; X64-SSE2-NEXT: setne %al
 ; X64-SSE2-NEXT: retq
 ;
-; X64-AVX2-LABEL: length16_eq:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
-; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
-; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX2-NEXT: setne %al
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: length16_eq:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovdqu (%rsi), %xmm0
+; X64-AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; X64-AVX-NEXT: vptest %xmm0, %xmm0
+; X64-AVX-NEXT: setne %al
+; X64-AVX-NEXT: retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
   %cmp = icmp ne i32 %call, 0
   ret i1 %cmp
@@ -605,14 +605,13 @@
 ; X64-SSE2-NEXT: sete %al
 ; X64-SSE2-NEXT: retq
 ;
-; X64-AVX2-LABEL: length16_eq_const:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
-; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX2-NEXT: sete %al
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: length16_eq_const:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovdqa {{.*}}(%rip), %xmm0
+; X64-AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; X64-AVX-NEXT: vptest %xmm0, %xmm0
+; X64-AVX-NEXT: sete %al
+; X64-AVX-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
@@ -682,18 +681,17 @@
 ; X64-SSE2-NEXT: sete %al
 ; X64-SSE2-NEXT: retq
 ;
-; X64-AVX2-LABEL: length24_eq:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; X64-AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
-; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
-; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
-; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX2-NEXT: sete %al
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: length24_eq: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovdqu (%rsi), %xmm0 +; X64-AVX-NEXT: vmovq 16(%rdi), %xmm1 +; X64-AVX-NEXT: vmovq 16(%rsi), %xmm2 +; X64-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; X64-AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vptest %xmm0, %xmm0 +; X64-AVX-NEXT: sete %al +; X64-AVX-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind %cmp = icmp eq i32 %call, 0 ret i1 %cmp @@ -737,17 +735,18 @@ ; X64-SSE2-NEXT: setne %al ; X64-SSE2-NEXT: retq ; -; X64-AVX2-LABEL: length24_eq_const: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm1, %xmm1 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: setne %al -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: length24_eq_const: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovq 16(%rdi), %xmm0 +; X64-AVX-NEXT: movabsq $3689065127958034230, %rax +; X64-AVX-NEXT: vmovq %rax, %xmm1 +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqa .LCPI21_0(%rip), %xmm1 +; X64-AVX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vptest %xmm0, %xmm0 +; X64-AVX-NEXT: setne %al +; X64-AVX-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind %c = icmp ne i32 %m, 0 ret i1 %c @@ -817,12 +816,22 @@ ; X64-SSE2-NEXT: sete %al ; X64-SSE2-NEXT: retq ; +; X64-AVX1-LABEL: length32_eq: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovdqu (%rsi), %xmm0 +; X64-AVX1-NEXT: vmovdqu 16(%rsi), %xmm1 +; X64-AVX1-NEXT: vpxor 16(%rdi), %xmm1, %xmm1 +; X64-AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vptest %xmm0, %xmm0 +; X64-AVX1-NEXT: sete %al +; X64-AVX1-NEXT: retq +; ; X64-AVX2-LABEL: length32_eq: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax -; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: vmovdqu (%rsi), %ymm0 +; X64-AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: sete %al ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -869,12 +878,22 @@ ; X64-SSE2-NEXT: setne %al ; X64-SSE2-NEXT: retq ; +; X64-AVX1-LABEL: length32_eq_const: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm0 +; X64-AVX1-NEXT: vpxor 16(%rdi), %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm1 +; X64-AVX1-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; X64-AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-AVX1-NEXT: vptest %xmm0, %xmm0 +; X64-AVX1-NEXT: setne %al +; X64-AVX1-NEXT: retq +; ; X64-AVX2-LABEL: length32_eq_const: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax -; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: vmovdqa {{.*}}(%rip), %ymm0 +; X64-AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: setne %al ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -925,15 +944,24 @@ ; X64-SSE2-NEXT: popq %rcx ; X64-SSE2-NEXT: retq ; +; X64-AVX1-LABEL: length64_eq: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: movl $64, %edx +; X64-AVX1-NEXT: callq 
memcmp
+; X64-AVX1-NEXT: testl %eax, %eax
+; X64-AVX1-NEXT: setne %al
+; X64-AVX1-NEXT: popq %rcx
+; X64-AVX1-NEXT: retq
+;
 ; X64-AVX2-LABEL: length64_eq:
 ; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
-; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: vmovdqu (%rsi), %ymm0
+; X64-AVX2-NEXT: vmovdqu 32(%rsi), %ymm1
+; X64-AVX2-NEXT: vpxor 32(%rdi), %ymm1, %ymm1
+; X64-AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vptest %ymm0, %ymm0
 ; X64-AVX2-NEXT: setne %al
 ; X64-AVX2-NEXT: vzeroupper
 ; X64-AVX2-NEXT: retq
@@ -966,15 +994,25 @@
 ; X64-SSE2-NEXT: popq %rcx
 ; X64-SSE2-NEXT: retq
 ;
+; X64-AVX1-LABEL: length64_eq_const:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: pushq %rax
+; X64-AVX1-NEXT: movl $.L.str, %esi
+; X64-AVX1-NEXT: movl $64, %edx
+; X64-AVX1-NEXT: callq memcmp
+; X64-AVX1-NEXT: testl %eax, %eax
+; X64-AVX1-NEXT: sete %al
+; X64-AVX1-NEXT: popq %rcx
+; X64-AVX1-NEXT: retq
+;
 ; X64-AVX2-LABEL: length64_eq_const:
 ; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
-; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: vmovdqa {{.*}}(%rip), %ymm0
+; X64-AVX2-NEXT: vpxor 32(%rdi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovdqa {{.*}}(%rip), %ymm1
+; X64-AVX2-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vptest %ymm0, %ymm0
 ; X64-AVX2-NEXT: sete %al
 ; X64-AVX2-NEXT: vzeroupper
 ; X64-AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll
--- a/llvm/test/CodeGen/X86/memcmp.ll
+++ b/llvm/test/CodeGen/X86/memcmp.ll
@@ -1002,10 +1002,9 @@
 ;
 ; X64-AVX-LABEL: length16_eq:
 ; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
-; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX-NEXT: vmovdqu (%rsi), %xmm0
+; X64-AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; X64-AVX-NEXT: vptest %xmm0, %xmm0
 ; X64-AVX-NEXT: setne %al
 ; X64-AVX-NEXT: retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
@@ -1059,10 +1058,9 @@
 ;
 ; X64-AVX-LABEL: length16_eq_const:
 ; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
-; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX-NEXT: vmovdqa {{.*}}(%rip), %xmm0
+; X64-AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; X64-AVX-NEXT: vptest %xmm0, %xmm0
 ; X64-AVX-NEXT: sete %al
 ; X64-AVX-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
@@ -1148,14 +1146,13 @@
 ;
 ; X64-AVX-LABEL: length24_eq:
 ; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; X64-AVX-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
-; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
-; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX-NEXT: vmovdqu (%rsi),
%xmm0 +; X64-AVX-NEXT: vmovq 16(%rdi), %xmm1 +; X64-AVX-NEXT: vmovq 16(%rsi), %xmm2 +; X64-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; X64-AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vptest %xmm0, %xmm0 ; X64-AVX-NEXT: sete %al ; X64-AVX-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind @@ -1215,13 +1212,14 @@ ; ; X64-AVX-LABEL: length24_eq_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm1, %xmm1 -; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX-NEXT: vmovq 16(%rdi), %xmm0 +; X64-AVX-NEXT: movabsq $3689065127958034230, %rax +; X64-AVX-NEXT: vmovq %rax, %xmm1 +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqa .LCPI36_0(%rip), %xmm1 +; X64-AVX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vptest %xmm0, %xmm0 ; X64-AVX-NEXT: setne %al ; X64-AVX-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind @@ -1307,22 +1305,20 @@ ; ; X64-AVX1-LABEL: length32_eq: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 -; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1 -; X64-AVX1-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX1-NEXT: vmovdqu (%rsi), %xmm0 +; X64-AVX1-NEXT: vmovdqu 16(%rsi), %xmm1 +; X64-AVX1-NEXT: vpxor 16(%rdi), %xmm1, %xmm1 +; X64-AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vptest %xmm0, %xmm0 ; X64-AVX1-NEXT: sete %al ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: length32_eq: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax -; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: vmovdqu (%rsi), %ymm0 +; X64-AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: sete %al ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1388,13 +1384,12 @@ ; ; X64-AVX-LABEL: length32_eq_prefer128: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm1 -; X64-AVX-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1 -; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 -; X64-AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX-NEXT: vmovdqu (%rsi), %xmm0 +; X64-AVX-NEXT: vmovdqu 16(%rsi), %xmm1 +; X64-AVX-NEXT: vpxor 16(%rdi), %xmm1, %xmm1 +; X64-AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vptest %xmm0, %xmm0 ; X64-AVX-NEXT: sete %al ; X64-AVX-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind @@ -1454,22 +1449,20 @@ ; ; X64-AVX1-LABEL: length32_eq_const: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 -; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm1, %xmm1 -; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; 
X64-AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm0
+; X64-AVX1-NEXT: vpxor 16(%rdi), %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm1
+; X64-AVX1-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; X64-AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vptest %xmm0, %xmm0
 ; X64-AVX1-NEXT: setne %al
 ; X64-AVX1-NEXT: retq
 ;
 ; X64-AVX2-LABEL: length32_eq_const:
 ; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
-; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: vmovdqa {{.*}}(%rip), %ymm0
+; X64-AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vptest %ymm0, %ymm0
 ; X64-AVX2-NEXT: setne %al
 ; X64-AVX2-NEXT: vzeroupper
 ; X64-AVX2-NEXT: retq
@@ -1532,13 +1525,12 @@
 ;
 ; X64-AVX2-LABEL: length64_eq:
 ; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
-; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: vmovdqu (%rsi), %ymm0
+; X64-AVX2-NEXT: vmovdqu 32(%rsi), %ymm1
+; X64-AVX2-NEXT: vpxor 32(%rdi), %ymm1, %ymm1
+; X64-AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vptest %ymm0, %ymm0
 ; X64-AVX2-NEXT: setne %al
 ; X64-AVX2-NEXT: vzeroupper
 ; X64-AVX2-NEXT: retq
@@ -1602,13 +1594,12 @@
 ;
 ; X64-AVX2-LABEL: length64_eq_const:
 ; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
-; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: vmovdqa {{.*}}(%rip), %ymm0
+; X64-AVX2-NEXT: vpxor 32(%rdi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovdqa {{.*}}(%rip), %ymm1
+; X64-AVX2-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vptest %ymm0, %ymm0
 ; X64-AVX2-NEXT: sete %al
 ; X64-AVX2-NEXT: vzeroupper
 ; X64-AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=AVXANY --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=AVXANY --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s
--check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX512 --check-prefix=AVX512BW ; Equality checks of 128/256-bit values can use PMOVMSK or PTEST to avoid scalarization. @@ -19,10 +19,9 @@ ; ; AVXANY-LABEL: ne_i128: ; AVXANY: # %bb.0: -; AVXANY-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVXANY-NEXT: vpmovmskb %xmm0, %ecx +; AVXANY-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVXANY-NEXT: xorl %eax, %eax -; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVXANY-NEXT: vptest %xmm0, %xmm0 ; AVXANY-NEXT: setne %al ; AVXANY-NEXT: retq %bcx = bitcast <2 x i64> %x to i128 @@ -44,10 +43,9 @@ ; ; AVXANY-LABEL: eq_i128: ; AVXANY: # %bb.0: -; AVXANY-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVXANY-NEXT: vpmovmskb %xmm0, %ecx +; AVXANY-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVXANY-NEXT: xorl %eax, %eax -; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVXANY-NEXT: vptest %xmm0, %xmm0 ; AVXANY-NEXT: sete %al ; AVXANY-NEXT: retq %bcx = bitcast <2 x i64> %x to i128 @@ -85,37 +83,21 @@ ; ; AVX1-LABEL: ne_i256: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovq %xmm2, %rcx -; AVX1-NEXT: vpextrq $1, %xmm0, %rdx -; AVX1-NEXT: vpextrq $1, %xmm2, %r8 -; AVX1-NEXT: vmovq %xmm1, %rdi -; AVX1-NEXT: xorq %rax, %rdi -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rsi -; AVX1-NEXT: xorq %rcx, %rsi -; AVX1-NEXT: orq %rdi, %rsi -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: xorq %rdx, %rax -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: xorq %r8, %rcx -; AVX1-NEXT: orq %rax, %rcx +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: orq %rsi, %rcx +; AVX1-NEXT: vptest %ymm0, %ymm0 ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX256-LABEL: ne_i256: -; AVX256: # %bb.0: -; AVX256-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: vpmovmskb %ymm0, %ecx -; AVX256-NEXT: xorl %eax, %eax -; AVX256-NEXT: cmpl $-1, %ecx -; AVX256-NEXT: setne %al -; AVX256-NEXT: vzeroupper -; AVX256-NEXT: retq +; AVX2-LABEL: ne_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: setne %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %bcx = bitcast <4 x i64> %x to i256 %bcy = bitcast <4 x i64> %y to i256 %cmp = icmp ne i256 %bcx, %bcy @@ -151,37 +133,21 @@ ; ; AVX1-LABEL: eq_i256: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovq %xmm2, %rcx -; AVX1-NEXT: vpextrq $1, %xmm0, %rdx -; AVX1-NEXT: vpextrq $1, %xmm2, %r8 -; AVX1-NEXT: vmovq %xmm1, %rdi -; AVX1-NEXT: xorq %rax, %rdi -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rsi -; AVX1-NEXT: xorq %rcx, %rsi -; AVX1-NEXT: orq %rdi, %rsi -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: xorq %rdx, %rax -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: xorq %r8, %rcx -; AVX1-NEXT: orq %rax, %rcx +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: orq %rsi, %rcx +; AVX1-NEXT: vptest %ymm0, %ymm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX256-LABEL: eq_i256: -; AVX256: # %bb.0: -; AVX256-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: vpmovmskb %ymm0, %ecx -; AVX256-NEXT: xorl %eax, %eax -; AVX256-NEXT: cmpl $-1, %ecx -; AVX256-NEXT: sete %al -; AVX256-NEXT: vzeroupper 
-; AVX256-NEXT: retq +; AVX2-LABEL: eq_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: sete %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %bcx = bitcast <4 x i64> %x to i256 %bcy = bitcast <4 x i64> %y to i256 %cmp = icmp eq i256 %bcx, %bcy @@ -518,14 +484,13 @@ ; ; AVXANY-LABEL: ne_i128_pair: ; AVXANY: # %bb.0: -; AVXANY-NEXT: vmovdqu (%rdi), %xmm0 -; AVXANY-NEXT: vmovdqu 16(%rdi), %xmm1 -; AVXANY-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1 -; AVXANY-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 -; AVXANY-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVXANY-NEXT: vpmovmskb %xmm0, %ecx +; AVXANY-NEXT: vmovdqu (%rsi), %xmm0 +; AVXANY-NEXT: vmovdqu 16(%rsi), %xmm1 +; AVXANY-NEXT: vpxor 16(%rdi), %xmm1, %xmm1 +; AVXANY-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVXANY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVXANY-NEXT: xorl %eax, %eax -; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVXANY-NEXT: vptest %xmm0, %xmm0 ; AVXANY-NEXT: setne %al ; AVXANY-NEXT: retq %a0 = load i128, i128* %a @@ -563,14 +528,13 @@ ; ; AVXANY-LABEL: eq_i128_pair: ; AVXANY: # %bb.0: -; AVXANY-NEXT: vmovdqu (%rdi), %xmm0 -; AVXANY-NEXT: vmovdqu 16(%rdi), %xmm1 -; AVXANY-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1 -; AVXANY-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 -; AVXANY-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVXANY-NEXT: vpmovmskb %xmm0, %ecx +; AVXANY-NEXT: vmovdqu (%rsi), %xmm0 +; AVXANY-NEXT: vmovdqu 16(%rsi), %xmm1 +; AVXANY-NEXT: vpxor 16(%rdi), %xmm1, %xmm1 +; AVXANY-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVXANY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVXANY-NEXT: xorl %eax, %eax -; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVXANY-NEXT: vptest %xmm0, %xmm0 ; AVXANY-NEXT: sete %al ; AVXANY-NEXT: retq %a0 = load i128, i128* %a @@ -622,46 +586,29 @@ ; ; AVX1-LABEL: ne_i256_pair: ; AVX1: # %bb.0: -; AVX1-NEXT: movq 16(%rdi), %r9 -; AVX1-NEXT: movq 24(%rdi), %r11 -; AVX1-NEXT: movq (%rdi), %r8 -; AVX1-NEXT: movq 8(%rdi), %r10 -; AVX1-NEXT: xorq 8(%rsi), %r10 -; AVX1-NEXT: xorq 24(%rsi), %r11 -; AVX1-NEXT: xorq (%rsi), %r8 -; AVX1-NEXT: xorq 16(%rsi), %r9 -; AVX1-NEXT: movq 48(%rdi), %rdx -; AVX1-NEXT: movq 32(%rdi), %rax -; AVX1-NEXT: movq 56(%rdi), %rcx -; AVX1-NEXT: movq 40(%rdi), %rdi -; AVX1-NEXT: xorq 40(%rsi), %rdi -; AVX1-NEXT: xorq 56(%rsi), %rcx -; AVX1-NEXT: orq %r11, %rcx -; AVX1-NEXT: orq %rdi, %rcx -; AVX1-NEXT: orq %r10, %rcx -; AVX1-NEXT: xorq 32(%rsi), %rax -; AVX1-NEXT: xorq 48(%rsi), %rdx -; AVX1-NEXT: orq %r9, %rdx -; AVX1-NEXT: orq %rax, %rdx -; AVX1-NEXT: orq %r8, %rdx +; AVX1-NEXT: vmovups (%rsi), %ymm0 +; AVX1-NEXT: vmovups 32(%rsi), %ymm1 +; AVX1-NEXT: vxorps 32(%rdi), %ymm1, %ymm1 +; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: vptest %ymm0, %ymm0 ; AVX1-NEXT: setne %al +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX256-LABEL: ne_i256_pair: -; AVX256: # %bb.0: -; AVX256-NEXT: vmovdqu (%rdi), %ymm0 -; AVX256-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX256-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1 -; AVX256-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 -; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: vpmovmskb %ymm0, %ecx -; AVX256-NEXT: xorl %eax, %eax -; AVX256-NEXT: cmpl $-1, %ecx -; AVX256-NEXT: setne %al -; AVX256-NEXT: vzeroupper -; AVX256-NEXT: retq +; AVX2-LABEL: ne_i256_pair: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rsi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rsi), %ymm1 +; AVX2-NEXT: vpxor 32(%rdi), %ymm1, %ymm1 +; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: setne %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %a0 = load i256, i256* %a %b0 = load i256, i256* %b %xor1 = xor i256 %a0, %b0 @@ -711,46 +658,29 @@ ; ; AVX1-LABEL: eq_i256_pair: ; AVX1: # %bb.0: -; AVX1-NEXT: movq 16(%rdi), %r9 -; AVX1-NEXT: movq 24(%rdi), %r11 -; AVX1-NEXT: movq (%rdi), %r8 -; AVX1-NEXT: movq 8(%rdi), %r10 -; AVX1-NEXT: xorq 8(%rsi), %r10 -; AVX1-NEXT: xorq 24(%rsi), %r11 -; AVX1-NEXT: xorq (%rsi), %r8 -; AVX1-NEXT: xorq 16(%rsi), %r9 -; AVX1-NEXT: movq 48(%rdi), %rdx -; AVX1-NEXT: movq 32(%rdi), %rax -; AVX1-NEXT: movq 56(%rdi), %rcx -; AVX1-NEXT: movq 40(%rdi), %rdi -; AVX1-NEXT: xorq 40(%rsi), %rdi -; AVX1-NEXT: xorq 56(%rsi), %rcx -; AVX1-NEXT: orq %r11, %rcx -; AVX1-NEXT: orq %rdi, %rcx -; AVX1-NEXT: orq %r10, %rcx -; AVX1-NEXT: xorq 32(%rsi), %rax -; AVX1-NEXT: xorq 48(%rsi), %rdx -; AVX1-NEXT: orq %r9, %rdx -; AVX1-NEXT: orq %rax, %rdx -; AVX1-NEXT: orq %r8, %rdx +; AVX1-NEXT: vmovups (%rsi), %ymm0 +; AVX1-NEXT: vmovups 32(%rsi), %ymm1 +; AVX1-NEXT: vxorps 32(%rdi), %ymm1, %ymm1 +; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: vptest %ymm0, %ymm0 ; AVX1-NEXT: sete %al +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX256-LABEL: eq_i256_pair: -; AVX256: # %bb.0: -; AVX256-NEXT: vmovdqu (%rdi), %ymm0 -; AVX256-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX256-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1 -; AVX256-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 -; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: vpmovmskb %ymm0, %ecx -; AVX256-NEXT: xorl %eax, %eax -; AVX256-NEXT: cmpl $-1, %ecx -; AVX256-NEXT: sete %al -; AVX256-NEXT: vzeroupper -; AVX256-NEXT: retq +; AVX2-LABEL: eq_i256_pair: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rsi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rsi), %ymm1 +; AVX2-NEXT: vpxor 32(%rdi), %ymm1, %ymm1 +; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: sete %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %a0 = load i256, i256* %a %b0 = load i256, i256* %b %xor1 = xor i256 %a0, %b0