diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1057,6 +1057,9 @@
     setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
     setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
     setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
+
+    if (Subtarget.hasGFNI())
+      setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
@@ -28972,23 +28975,33 @@
   if (Subtarget.hasXOP() && !VT.is512BitVector())
     return LowerBITREVERSE_XOP(Op, DAG);
 
-  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
-
   SDValue In = Op.getOperand(0);
   SDLoc DL(Op);
 
+  assert(VT.getScalarType() == MVT::i8 &&
+         "Only byte vector BITREVERSE supported");
+
   // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
   if (VT == MVT::v64i8 && !Subtarget.hasBWI())
     return splitVectorIntUnary(Op, DAG);
 
-  unsigned NumElts = VT.getVectorNumElements();
-  assert(VT.getScalarType() == MVT::i8 &&
-         "Only byte vector BITREVERSE supported");
-
   // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
-  if (VT.is256BitVector() && !Subtarget.hasInt256())
+  if (VT == MVT::v32i8 && !Subtarget.hasInt256())
     return splitVectorIntUnary(Op, DAG);
 
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
+  if (Subtarget.hasGFNI()) {
+    MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
+    SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
+    Matrix = DAG.getBitcast(VT, Matrix);
+    return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
+                       DAG.getTargetConstant(0, DL, MVT::i8));
+  }
+
+  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
+
   // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
   // two nibbles, and a PSHUFB lookup finds the bitreverse of each
   // 0-15 value (moved to the other nibble).
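Note on the lowering above: GF2P8AFFINEQB applies, to every byte of the source, the affine transform A*x + b over GF(2), where the 8x8 bit matrix A is packed into the corresponding qword of the second operand and b is the immediate byte. The constant 0x8040201008040201 is the anti-diagonal matrix, so with a zero immediate the transform is exactly a per-byte bit reversal. A minimal standalone model of the per-byte semantics (not part of the patch; `__builtin_parity` and `uint64_t` literals assume GCC/Clang):

```cpp
#include <cassert>
#include <cstdint>

// Per the Intel SDM pseudocode: result bit b of each byte is the parity of
// (matrix byte [7-b] AND source byte), XORed with immediate bit b.
static uint8_t gf2p8affineByte(uint64_t Matrix, uint8_t Src, uint8_t Imm) {
  uint8_t Result = 0;
  for (int B = 0; B < 8; ++B) {
    uint8_t Row = uint8_t(Matrix >> (8 * (7 - B)));
    Result |= uint8_t((__builtin_parity(Row & Src) ^ ((Imm >> B) & 1)) << B);
  }
  return Result;
}

int main() {
  // Rows 0x80,0x40,...,0x01 select source bits 7,6,...,0: a bit reversal.
  const uint64_t BitReverseMatrix = 0x8040201008040201ULL;
  assert(gf2p8affineByte(BitReverseMatrix, 0x01, 0) == 0x80);
  assert(gf2p8affineByte(BitReverseMatrix, 0xB2, 0) == 0x4D);
  return 0;
}
```

The pre-GFNI fallback kept below instead splits each byte into two nibbles and reverses each through PSHUFB table lookups; GFNI replaces all of that with one instruction and one loaded constant, as the v16i8 checks in the test diff below show.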
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -7,6 +7,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNISSE --check-prefix=GFNISSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNISSE --check-prefix=GFNISSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+gfni | FileCheck %s --check-prefix=ALL --check-prefix=GFNIAVX512BW ; Make sure we don't crash with avx512bw and xop ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw @@ -57,6 +61,63 @@ ; XOP-NEXT: vmovd %xmm0, %eax ; XOP-NEXT: # kill: def $al killed $al killed $eax ; XOP-NEXT: retq +; +; GFNISSE-LABEL: test_bitreverse_i8: +; GFNISSE: # %bb.0: +; GFNISSE-NEXT: # kill: def $edi killed $edi def $rdi +; GFNISSE-NEXT: rolb $4, %dil +; GFNISSE-NEXT: movl %edi, %eax +; GFNISSE-NEXT: andb $51, %al +; GFNISSE-NEXT: shlb $2, %al +; GFNISSE-NEXT: andb $-52, %dil +; GFNISSE-NEXT: shrb $2, %dil +; GFNISSE-NEXT: orb %al, %dil +; GFNISSE-NEXT: movl %edi, %eax +; GFNISSE-NEXT: andb $85, %al +; GFNISSE-NEXT: addb %al, %al +; GFNISSE-NEXT: andb $-86, %dil +; GFNISSE-NEXT: shrb %dil +; GFNISSE-NEXT: addl %edi, %eax +; GFNISSE-NEXT: # kill: def $al killed $al killed $eax +; GFNISSE-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_i8: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: # kill: def $edi killed $edi def $rdi +; GFNIAVX2-NEXT: rolb $4, %dil +; GFNIAVX2-NEXT: movl %edi, %eax +; GFNIAVX2-NEXT: andb $51, %al +; GFNIAVX2-NEXT: shlb $2, %al +; GFNIAVX2-NEXT: andb $-52, %dil +; GFNIAVX2-NEXT: shrb $2, %dil +; GFNIAVX2-NEXT: orb %al, %dil +; GFNIAVX2-NEXT: movl %edi, %eax +; GFNIAVX2-NEXT: andb $85, %al +; GFNIAVX2-NEXT: addb %al, %al +; GFNIAVX2-NEXT: andb $-86, %dil +; GFNIAVX2-NEXT: shrb %dil +; GFNIAVX2-NEXT: addl %edi, %eax +; GFNIAVX2-NEXT: # kill: def $al killed $al killed $eax +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: test_bitreverse_i8: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: # kill: def $edi killed $edi def $rdi +; GFNIAVX512BW-NEXT: rolb $4, %dil +; GFNIAVX512BW-NEXT: movl %edi, %eax +; GFNIAVX512BW-NEXT: andb $51, %al +; GFNIAVX512BW-NEXT: shlb $2, %al +; GFNIAVX512BW-NEXT: andb $-52, %dil +; GFNIAVX512BW-NEXT: shrb $2, %dil +; GFNIAVX512BW-NEXT: orb %al, %dil +; GFNIAVX512BW-NEXT: movl %edi, %eax +; GFNIAVX512BW-NEXT: andb $85, %al +; GFNIAVX512BW-NEXT: addb %al, %al +; GFNIAVX512BW-NEXT: andb $-86, %dil +; GFNIAVX512BW-NEXT: shrb %dil +; GFNIAVX512BW-NEXT: addl %edi, %eax +; GFNIAVX512BW-NEXT: # kill: def $al killed $al killed $eax +; GFNIAVX512BW-NEXT: retq %b = call i8 @llvm.bitreverse.i8(i8 %a) ret i8 %b } @@ -115,6 +176,75 @@ ; XOP-NEXT: vmovd %xmm0, %eax ; XOP-NEXT: # kill: def $ax killed $ax killed $eax ; XOP-NEXT: 
retq +; +; GFNISSE-LABEL: test_bitreverse_i16: +; GFNISSE: # %bb.0: +; GFNISSE-NEXT: # kill: def $edi killed $edi def $rdi +; GFNISSE-NEXT: rolw $8, %di +; GFNISSE-NEXT: movl %edi, %eax +; GFNISSE-NEXT: andl $3855, %eax # imm = 0xF0F +; GFNISSE-NEXT: shll $4, %eax +; GFNISSE-NEXT: andl $61680, %edi # imm = 0xF0F0 +; GFNISSE-NEXT: shrl $4, %edi +; GFNISSE-NEXT: orl %eax, %edi +; GFNISSE-NEXT: movl %edi, %eax +; GFNISSE-NEXT: andl $13107, %eax # imm = 0x3333 +; GFNISSE-NEXT: andl $52428, %edi # imm = 0xCCCC +; GFNISSE-NEXT: shrl $2, %edi +; GFNISSE-NEXT: leal (%rdi,%rax,4), %eax +; GFNISSE-NEXT: movl %eax, %ecx +; GFNISSE-NEXT: andl $21845, %ecx # imm = 0x5555 +; GFNISSE-NEXT: andl $43690, %eax # imm = 0xAAAA +; GFNISSE-NEXT: shrl %eax +; GFNISSE-NEXT: leal (%rax,%rcx,2), %eax +; GFNISSE-NEXT: # kill: def $ax killed $ax killed $eax +; GFNISSE-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_i16: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: # kill: def $edi killed $edi def $rdi +; GFNIAVX2-NEXT: rolw $8, %di +; GFNIAVX2-NEXT: movl %edi, %eax +; GFNIAVX2-NEXT: andl $3855, %eax # imm = 0xF0F +; GFNIAVX2-NEXT: shll $4, %eax +; GFNIAVX2-NEXT: andl $61680, %edi # imm = 0xF0F0 +; GFNIAVX2-NEXT: shrl $4, %edi +; GFNIAVX2-NEXT: orl %eax, %edi +; GFNIAVX2-NEXT: movl %edi, %eax +; GFNIAVX2-NEXT: andl $13107, %eax # imm = 0x3333 +; GFNIAVX2-NEXT: andl $52428, %edi # imm = 0xCCCC +; GFNIAVX2-NEXT: shrl $2, %edi +; GFNIAVX2-NEXT: leal (%rdi,%rax,4), %eax +; GFNIAVX2-NEXT: movl %eax, %ecx +; GFNIAVX2-NEXT: andl $21845, %ecx # imm = 0x5555 +; GFNIAVX2-NEXT: andl $43690, %eax # imm = 0xAAAA +; GFNIAVX2-NEXT: shrl %eax +; GFNIAVX2-NEXT: leal (%rax,%rcx,2), %eax +; GFNIAVX2-NEXT: # kill: def $ax killed $ax killed $eax +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: test_bitreverse_i16: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: # kill: def $edi killed $edi def $rdi +; GFNIAVX512BW-NEXT: rolw $8, %di +; GFNIAVX512BW-NEXT: movl %edi, %eax +; GFNIAVX512BW-NEXT: andl $3855, %eax # imm = 0xF0F +; GFNIAVX512BW-NEXT: shll $4, %eax +; GFNIAVX512BW-NEXT: andl $61680, %edi # imm = 0xF0F0 +; GFNIAVX512BW-NEXT: shrl $4, %edi +; GFNIAVX512BW-NEXT: orl %eax, %edi +; GFNIAVX512BW-NEXT: movl %edi, %eax +; GFNIAVX512BW-NEXT: andl $13107, %eax # imm = 0x3333 +; GFNIAVX512BW-NEXT: andl $52428, %edi # imm = 0xCCCC +; GFNIAVX512BW-NEXT: shrl $2, %edi +; GFNIAVX512BW-NEXT: leal (%rdi,%rax,4), %eax +; GFNIAVX512BW-NEXT: movl %eax, %ecx +; GFNIAVX512BW-NEXT: andl $21845, %ecx # imm = 0x5555 +; GFNIAVX512BW-NEXT: andl $43690, %eax # imm = 0xAAAA +; GFNIAVX512BW-NEXT: shrl %eax +; GFNIAVX512BW-NEXT: leal (%rax,%rcx,2), %eax +; GFNIAVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; GFNIAVX512BW-NEXT: retq %b = call i16 @llvm.bitreverse.i16(i16 %a) ret i16 %b } @@ -170,6 +300,72 @@ ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 ; XOP-NEXT: vmovd %xmm0, %eax ; XOP-NEXT: retq +; +; GFNISSE-LABEL: test_bitreverse_i32: +; GFNISSE: # %bb.0: +; GFNISSE-NEXT: # kill: def $edi killed $edi def $rdi +; GFNISSE-NEXT: bswapl %edi +; GFNISSE-NEXT: movl %edi, %eax +; GFNISSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; GFNISSE-NEXT: shll $4, %eax +; GFNISSE-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0 +; GFNISSE-NEXT: shrl $4, %edi +; GFNISSE-NEXT: orl %eax, %edi +; GFNISSE-NEXT: movl %edi, %eax +; GFNISSE-NEXT: andl $858993459, %eax # imm = 0x33333333 +; GFNISSE-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC +; GFNISSE-NEXT: shrl $2, %edi +; GFNISSE-NEXT: leal (%rdi,%rax,4), %eax +; GFNISSE-NEXT: movl %eax, %ecx +; GFNISSE-NEXT: 
andl $1431655765, %ecx # imm = 0x55555555 +; GFNISSE-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA +; GFNISSE-NEXT: shrl %eax +; GFNISSE-NEXT: leal (%rax,%rcx,2), %eax +; GFNISSE-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_i32: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: # kill: def $edi killed $edi def $rdi +; GFNIAVX2-NEXT: bswapl %edi +; GFNIAVX2-NEXT: movl %edi, %eax +; GFNIAVX2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; GFNIAVX2-NEXT: shll $4, %eax +; GFNIAVX2-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0 +; GFNIAVX2-NEXT: shrl $4, %edi +; GFNIAVX2-NEXT: orl %eax, %edi +; GFNIAVX2-NEXT: movl %edi, %eax +; GFNIAVX2-NEXT: andl $858993459, %eax # imm = 0x33333333 +; GFNIAVX2-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC +; GFNIAVX2-NEXT: shrl $2, %edi +; GFNIAVX2-NEXT: leal (%rdi,%rax,4), %eax +; GFNIAVX2-NEXT: movl %eax, %ecx +; GFNIAVX2-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; GFNIAVX2-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA +; GFNIAVX2-NEXT: shrl %eax +; GFNIAVX2-NEXT: leal (%rax,%rcx,2), %eax +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: test_bitreverse_i32: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: # kill: def $edi killed $edi def $rdi +; GFNIAVX512BW-NEXT: bswapl %edi +; GFNIAVX512BW-NEXT: movl %edi, %eax +; GFNIAVX512BW-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; GFNIAVX512BW-NEXT: shll $4, %eax +; GFNIAVX512BW-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0 +; GFNIAVX512BW-NEXT: shrl $4, %edi +; GFNIAVX512BW-NEXT: orl %eax, %edi +; GFNIAVX512BW-NEXT: movl %edi, %eax +; GFNIAVX512BW-NEXT: andl $858993459, %eax # imm = 0x33333333 +; GFNIAVX512BW-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC +; GFNIAVX512BW-NEXT: shrl $2, %edi +; GFNIAVX512BW-NEXT: leal (%rdi,%rax,4), %eax +; GFNIAVX512BW-NEXT: movl %eax, %ecx +; GFNIAVX512BW-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; GFNIAVX512BW-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA +; GFNIAVX512BW-NEXT: shrl %eax +; GFNIAVX512BW-NEXT: leal (%rax,%rcx,2), %eax +; GFNIAVX512BW-NEXT: retq %b = call i32 @llvm.bitreverse.i32(i32 %a) ret i32 %b } @@ -229,6 +425,78 @@ ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 ; XOP-NEXT: vmovq %xmm0, %rax ; XOP-NEXT: retq +; +; GFNISSE-LABEL: test_bitreverse_i64: +; GFNISSE: # %bb.0: +; GFNISSE-NEXT: bswapq %rdi +; GFNISSE-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F +; GFNISSE-NEXT: andq %rdi, %rax +; GFNISSE-NEXT: shlq $4, %rax +; GFNISSE-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 +; GFNISSE-NEXT: andq %rdi, %rcx +; GFNISSE-NEXT: shrq $4, %rcx +; GFNISSE-NEXT: orq %rax, %rcx +; GFNISSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; GFNISSE-NEXT: andq %rcx, %rax +; GFNISSE-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC +; GFNISSE-NEXT: andq %rcx, %rdx +; GFNISSE-NEXT: shrq $2, %rdx +; GFNISSE-NEXT: leaq (%rdx,%rax,4), %rax +; GFNISSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; GFNISSE-NEXT: andq %rax, %rcx +; GFNISSE-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA +; GFNISSE-NEXT: andq %rax, %rdx +; GFNISSE-NEXT: shrq %rdx +; GFNISSE-NEXT: leaq (%rdx,%rcx,2), %rax +; GFNISSE-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_i64: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: bswapq %rdi +; GFNIAVX2-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F +; GFNIAVX2-NEXT: andq %rdi, %rax +; GFNIAVX2-NEXT: shlq $4, %rax +; GFNIAVX2-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 
+; GFNIAVX2-NEXT: andq %rdi, %rcx +; GFNIAVX2-NEXT: shrq $4, %rcx +; GFNIAVX2-NEXT: orq %rax, %rcx +; GFNIAVX2-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; GFNIAVX2-NEXT: andq %rcx, %rax +; GFNIAVX2-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC +; GFNIAVX2-NEXT: andq %rcx, %rdx +; GFNIAVX2-NEXT: shrq $2, %rdx +; GFNIAVX2-NEXT: leaq (%rdx,%rax,4), %rax +; GFNIAVX2-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; GFNIAVX2-NEXT: andq %rax, %rcx +; GFNIAVX2-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA +; GFNIAVX2-NEXT: andq %rax, %rdx +; GFNIAVX2-NEXT: shrq %rdx +; GFNIAVX2-NEXT: leaq (%rdx,%rcx,2), %rax +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: test_bitreverse_i64: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: bswapq %rdi +; GFNIAVX512BW-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F +; GFNIAVX512BW-NEXT: andq %rdi, %rax +; GFNIAVX512BW-NEXT: shlq $4, %rax +; GFNIAVX512BW-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 +; GFNIAVX512BW-NEXT: andq %rdi, %rcx +; GFNIAVX512BW-NEXT: shrq $4, %rcx +; GFNIAVX512BW-NEXT: orq %rax, %rcx +; GFNIAVX512BW-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; GFNIAVX512BW-NEXT: andq %rcx, %rax +; GFNIAVX512BW-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC +; GFNIAVX512BW-NEXT: andq %rcx, %rdx +; GFNIAVX512BW-NEXT: shrq $2, %rdx +; GFNIAVX512BW-NEXT: leaq (%rdx,%rax,4), %rax +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: andq %rax, %rcx +; GFNIAVX512BW-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA +; GFNIAVX512BW-NEXT: andq %rax, %rdx +; GFNIAVX512BW-NEXT: shrq %rdx +; GFNIAVX512BW-NEXT: leaq (%rdx,%rcx,2), %rax +; GFNIAVX512BW-NEXT: retq %b = call i64 @llvm.bitreverse.i64(i64 %a) ret i64 %b } @@ -288,6 +556,21 @@ ; XOP: # %bb.0: ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 ; XOP-NEXT: retq +; +; GFNISSE-LABEL: test_bitreverse_v16i8: +; GFNISSE: # %bb.0: +; GFNISSE-NEXT: gf2p8affineqb $0, {{.*}}(%rip), %xmm0 +; GFNISSE-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_v16i8: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0 +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: test_bitreverse_v16i8: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0 +; GFNIAVX512BW-NEXT: retq %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ret <16 x i8> %b } @@ -353,6 +636,33 @@ ; XOP: # %bb.0: ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 ; XOP-NEXT: retq +; +; GFNISSE2-LABEL: test_bitreverse_v8i16: +; GFNISSE2: # %bb.0: +; GFNISSE2-NEXT: movdqa %xmm0, %xmm1 +; GFNISSE2-NEXT: psrlw $8, %xmm1 +; GFNISSE2-NEXT: psllw $8, %xmm0 +; GFNISSE2-NEXT: por %xmm1, %xmm0 +; GFNISSE2-NEXT: gf2p8affineqb $0, {{.*}}(%rip), %xmm0 +; GFNISSE2-NEXT: retq +; +; GFNISSSE3-LABEL: test_bitreverse_v8i16: +; GFNISSSE3: # %bb.0: +; GFNISSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; GFNISSSE3-NEXT: gf2p8affineqb $0, {{.*}}(%rip), %xmm0 +; GFNISSSE3-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_v8i16: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0 +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: test_bitreverse_v8i16: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} 
xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0 +; GFNIAVX512BW-NEXT: retq %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ret <8 x i16> %b } @@ -423,6 +733,38 @@ ; XOP: # %bb.0: ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 ; XOP-NEXT: retq +; +; GFNISSE2-LABEL: test_bitreverse_v4i32: +; GFNISSE2: # %bb.0: +; GFNISSE2-NEXT: pxor %xmm1, %xmm1 +; GFNISSE2-NEXT: movdqa %xmm0, %xmm2 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm2, %xmm0 +; GFNISSE2-NEXT: gf2p8affineqb $0, {{.*}}(%rip), %xmm0 +; GFNISSE2-NEXT: retq +; +; GFNISSSE3-LABEL: test_bitreverse_v4i32: +; GFNISSSE3: # %bb.0: +; GFNISSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; GFNISSSE3-NEXT: gf2p8affineqb $0, {{.*}}(%rip), %xmm0 +; GFNISSSE3-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_v4i32: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0 +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: test_bitreverse_v4i32: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0 +; GFNIAVX512BW-NEXT: retq %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ret <4 x i32> %b } @@ -495,6 +837,40 @@ ; XOP: # %bb.0: ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 ; XOP-NEXT: retq +; +; GFNISSE2-LABEL: test_bitreverse_v2i64: +; GFNISSE2: # %bb.0: +; GFNISSE2-NEXT: pxor %xmm1, %xmm1 +; GFNISSE2-NEXT: movdqa %xmm0, %xmm2 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm2, %xmm0 +; GFNISSE2-NEXT: gf2p8affineqb $0, {{.*}}(%rip), %xmm0 +; GFNISSE2-NEXT: retq +; +; GFNISSSE3-LABEL: test_bitreverse_v2i64: +; GFNISSSE3: # %bb.0: +; GFNISSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; GFNISSSE3-NEXT: gf2p8affineqb $0, {{.*}}(%rip), %xmm0 +; GFNISSSE3-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_v2i64: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0 
+; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: test_bitreverse_v2i64: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{.*}}(%rip), %xmm0, %xmm0 +; GFNIAVX512BW-NEXT: retq %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ret <2 x i64> %b } @@ -634,6 +1010,25 @@ ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq +; +; GFNISSE-LABEL: test_bitreverse_v32i8: +; GFNISSE: # %bb.0: +; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1 +; GFNISSE-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_v32i8: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: test_bitreverse_v32i8: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 +; GFNIAVX512BW-NEXT: retq %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ret <32 x i8> %b } @@ -789,6 +1184,45 @@ ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq +; +; GFNISSE2-LABEL: test_bitreverse_v16i16: +; GFNISSE2: # %bb.0: +; GFNISSE2-NEXT: movdqa %xmm0, %xmm2 +; GFNISSE2-NEXT: psrlw $8, %xmm2 +; GFNISSE2-NEXT: psllw $8, %xmm0 +; GFNISSE2-NEXT: por %xmm2, %xmm0 +; GFNISSE2-NEXT: movdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm2, %xmm0 +; GFNISSE2-NEXT: movdqa %xmm1, %xmm3 +; GFNISSE2-NEXT: psrlw $8, %xmm3 +; GFNISSE2-NEXT: psllw $8, %xmm1 +; GFNISSE2-NEXT: por %xmm3, %xmm1 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm2, %xmm1 +; GFNISSE2-NEXT: retq +; +; GFNISSSE3-LABEL: test_bitreverse_v16i16: +; GFNISSSE3: # %bb.0: +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; GFNISSSE3-NEXT: pshufb %xmm2, %xmm0 +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 +; GFNISSSE3-NEXT: pshufb %xmm2, %xmm1 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 +; GFNISSSE3-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_v16i16: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: test_bitreverse_v16i16: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] +; GFNIAVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 +; GFNIAVX512BW-NEXT: retq %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ret <16 x i16> %b } @@ -953,6 +1387,54 @@ ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 ; 
XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq +; +; GFNISSE2-LABEL: test_bitreverse_v8i32: +; GFNISSE2: # %bb.0: +; GFNISSE2-NEXT: pxor %xmm2, %xmm2 +; GFNISSE2-NEXT: movdqa %xmm0, %xmm3 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm3, %xmm0 +; GFNISSE2-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 +; GFNISSE2-NEXT: movdqa %xmm1, %xmm4 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm4, %xmm1 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 +; GFNISSE2-NEXT: retq +; +; GFNISSSE3-LABEL: test_bitreverse_v8i32: +; GFNISSSE3: # %bb.0: +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; GFNISSSE3-NEXT: pshufb %xmm2, %xmm0 +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 +; GFNISSSE3-NEXT: pshufb %xmm2, %xmm1 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 +; GFNISSSE3-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_v8i32: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: test_bitreverse_v8i32: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] +; GFNIAVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 +; GFNIAVX512BW-NEXT: retq %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ret <8 x i32> %b } @@ -1121,6 +1603,58 @@ ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq +; +; GFNISSE2-LABEL: test_bitreverse_v4i64: +; GFNISSE2: # %bb.0: +; GFNISSE2-NEXT: pxor %xmm2, %xmm2 +; GFNISSE2-NEXT: movdqa %xmm0, %xmm3 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm3, %xmm0 +; GFNISSE2-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 +; GFNISSE2-NEXT: movdqa %xmm1, %xmm4 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm4, %xmm1 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 +; GFNISSE2-NEXT: retq +; +; GFNISSSE3-LABEL: test_bitreverse_v4i64: +; GFNISSSE3: # %bb.0: +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; GFNISSSE3-NEXT: pshufb %xmm2, %xmm0 +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 +; GFNISSSE3-NEXT: pshufb %xmm2, %xmm1 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 +; GFNISSSE3-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_v4i64: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: test_bitreverse_v4i64: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] +; GFNIAVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 +; GFNIAVX512BW-NEXT: retq %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ret <4 x i64> %b } @@ -1366,6 +1900,27 @@ ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; XOPAVX2-NEXT: retq +; +; GFNISSE-LABEL: test_bitreverse_v64i8: +; GFNISSE: # %bb.0: +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 +; GFNISSE-NEXT: gf2p8affineqb $0, 
%xmm4, %xmm3 +; GFNISSE-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_v64i8: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: test_bitreverse_v64i8: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: retq %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ret <64 x i8> %b } @@ -1645,6 +2200,61 @@ ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; XOPAVX2-NEXT: retq +; +; GFNISSE2-LABEL: test_bitreverse_v32i16: +; GFNISSE2: # %bb.0: +; GFNISSE2-NEXT: movdqa %xmm0, %xmm4 +; GFNISSE2-NEXT: psrlw $8, %xmm4 +; GFNISSE2-NEXT: psllw $8, %xmm0 +; GFNISSE2-NEXT: por %xmm4, %xmm0 +; GFNISSE2-NEXT: movdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745] +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 +; GFNISSE2-NEXT: movdqa %xmm1, %xmm5 +; GFNISSE2-NEXT: psrlw $8, %xmm5 +; GFNISSE2-NEXT: psllw $8, %xmm1 +; GFNISSE2-NEXT: por %xmm5, %xmm1 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 +; GFNISSE2-NEXT: movdqa %xmm2, %xmm5 +; GFNISSE2-NEXT: psrlw $8, %xmm5 +; GFNISSE2-NEXT: psllw $8, %xmm2 +; GFNISSE2-NEXT: por %xmm5, %xmm2 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 +; GFNISSE2-NEXT: movdqa %xmm3, %xmm5 +; GFNISSE2-NEXT: psrlw $8, %xmm5 +; GFNISSE2-NEXT: psllw $8, %xmm3 +; GFNISSE2-NEXT: por %xmm5, %xmm3 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm4, %xmm3 +; GFNISSE2-NEXT: retq +; +; GFNISSSE3-LABEL: test_bitreverse_v32i16: +; GFNISSSE3: # %bb.0: +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm0 +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm1 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm2 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm3 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 +; GFNISSSE3-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_v32i16: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: test_bitreverse_v32i16: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] +; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: retq %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ret <32 x i16> %b } @@ -1941,6 +2551,78 @@ ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; XOPAVX2-NEXT: retq +; +; GFNISSE2-LABEL: test_bitreverse_v16i32: +; 
GFNISSE2: # %bb.0: +; GFNISSE2-NEXT: pxor %xmm4, %xmm4 +; GFNISSE2-NEXT: movdqa %xmm0, %xmm5 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm5, %xmm0 +; GFNISSE2-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 +; GFNISSE2-NEXT: movdqa %xmm1, %xmm6 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm6, %xmm1 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 +; GFNISSE2-NEXT: movdqa %xmm2, %xmm6 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm6, %xmm2 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 +; GFNISSE2-NEXT: movdqa %xmm3, %xmm6 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm6, %xmm3 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 +; GFNISSE2-NEXT: retq +; +; GFNISSSE3-LABEL: test_bitreverse_v16i32: +; GFNISSSE3: # %bb.0: +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm0 +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm1 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 +; GFNISSSE3-NEXT: 
pshufb %xmm4, %xmm2 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm3 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 +; GFNISSSE3-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_v16i32: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: test_bitreverse_v16i32: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] +; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: retq %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ret <16 x i32> %b } @@ -2245,6 +2927,86 @@ ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; XOPAVX2-NEXT: retq +; +; GFNISSE2-LABEL: test_bitreverse_v8i64: +; GFNISSE2: # %bb.0: +; GFNISSE2-NEXT: pxor %xmm4, %xmm4 +; GFNISSE2-NEXT: movdqa %xmm0, %xmm5 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm5, %xmm0 +; GFNISSE2-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 +; GFNISSE2-NEXT: movdqa %xmm1, %xmm6 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm6, %xmm1 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 +; GFNISSE2-NEXT: movdqa %xmm2, %xmm6 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] +; GFNISSE2-NEXT: pshuflw 
{{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm6, %xmm2 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 +; GFNISSE2-NEXT: movdqa %xmm3, %xmm6 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm6, %xmm3 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 +; GFNISSE2-NEXT: retq +; +; GFNISSSE3-LABEL: test_bitreverse_v8i64: +; GFNISSSE3: # %bb.0: +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm0 +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm1 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm2 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm3 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 +; GFNISSSE3-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_v8i64: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: test_bitreverse_v8i64: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56] +; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: retq %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ret <8 x i64> %b } @@ -2277,6 +3039,21 @@ ; XOP: # %bb.0: ; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] ; XOP-NEXT: retq +; +; GFNISSE-LABEL: fold_bitreverse_v16i8: +; GFNISSE: # %bb.0: +; GFNISSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] +; GFNISSE-NEXT: retq +; +; GFNIAVX2-LABEL: fold_bitreverse_v16i8: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vmovaps {{.*#+}} xmm0 = 
[0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: fold_bitreverse_v16i8: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] +; GFNIAVX512BW-NEXT: retq %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> ) ret <16 x i8> %b } @@ -2297,6 +3074,22 @@ ; XOP: # %bb.0: ; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] ; XOP-NEXT: retq +; +; GFNISSE-LABEL: fold_bitreverse_v16i16: +; GFNISSE: # %bb.0: +; GFNISSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959] +; GFNISSE-NEXT: movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863] +; GFNISSE-NEXT: retq +; +; GFNIAVX2-LABEL: fold_bitreverse_v16i16: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: fold_bitreverse_v16i16: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] +; GFNIAVX512BW-NEXT: retq %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> ) ret <16 x i16> %b } @@ -2332,6 +3125,25 @@ ; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] ; XOP-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] ; XOP-NEXT: retq +; +; GFNISSE-LABEL: fold_bitreverse_v16i32: +; GFNISSE: # %bb.0: +; GFNISSE-NEXT: movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471] +; GFNISSE-NEXT: movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559] +; GFNISSE-NEXT: movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015] +; GFNISSE-NEXT: movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103] +; GFNISSE-NEXT: retq +; +; GFNIAVX2-LABEL: fold_bitreverse_v16i32: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] +; GFNIAVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512BW-LABEL: fold_bitreverse_v16i32: +; GFNIAVX512BW: # %bb.0: +; GFNIAVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] +; GFNIAVX512BW-NEXT: retq %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> ) ret <16 x i32> %b }
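For reference, the scalar i8/i16/i32/i64 tests above show that GFNI changes nothing for scalar types: they still expand to the generic bswap-plus-mask-and-shift ladder, which is where the 0x0F0F0F0F, 0x33333333, and 0x55555555 immediates in the checks come from. A sketch of that ladder for i32 (illustrative only, not code from this patch):

```cpp
#include <cassert>
#include <cstdint>

// The swap ladder the scalar checks encode: byte swap, then exchange
// nibbles, bit pairs, and adjacent bits with mask-and-shift pairs.
static uint32_t bitReverse32(uint32_t X) {
  X = __builtin_bswap32(X);                                // bswapl
  X = ((X & 0x0F0F0F0Fu) << 4) | ((X & 0xF0F0F0F0u) >> 4); // nibbles
  X = ((X & 0x33333333u) << 2) | ((X & 0xCCCCCCCCu) >> 2); // bit pairs
  X = ((X & 0x55555555u) << 1) | ((X & 0xAAAAAAAAu) >> 1); // single bits
  return X;
}

int main() {
  assert(bitReverse32(0x00000001u) == 0x80000000u);
  assert(bitReverse32(0xB2000000u) == 0x0000004Du);
  return 0;
}
```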