diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -860,18 +860,21 @@
                                                      FeatureGFNI,
                                                      FeatureRDPID,
                                                      FeatureFSRM];
-  list<SubtargetFeature> ICLTuning = [TuningFastGather,
-                                      TuningMacroFusion,
-                                      TuningSlow3OpsLEA,
-                                      TuningSlowDivide64,
-                                      TuningFastScalarFSQRT,
-                                      TuningFastVectorFSQRT,
-                                      TuningFastSHLDRotate,
-                                      TuningFast15ByteNOP,
-                                      TuningFastVariableCrossLaneShuffle,
-                                      TuningFastVariablePerLaneShuffle,
-                                      TuningPrefer256Bit,
-                                      TuningInsertVZEROUPPER];
+  list<SubtargetFeature> ICLCommonTuning = [TuningFastGather,
+                                            TuningMacroFusion,
+                                            TuningSlow3OpsLEA,
+                                            TuningSlowDivide64,
+                                            TuningFastScalarFSQRT,
+                                            TuningFastVectorFSQRT,
+                                            TuningFastSHLDRotate,
+                                            TuningFast15ByteNOP,
+                                            TuningFastVariableCrossLaneShuffle,
+                                            TuningFastVariablePerLaneShuffle,
+                                            TuningInsertVZEROUPPER];
+
+  list<SubtargetFeature> ICLAdditionalTuning = [TuningPrefer256Bit];
+  list<SubtargetFeature> ICLTuning =
+    !listconcat(ICLCommonTuning, ICLAdditionalTuning);
   list<SubtargetFeature> ICLFeatures =
     !listconcat(CNLFeatures, ICLAdditionalFeatures);
@@ -879,7 +882,7 @@
   list<SubtargetFeature> ICXAdditionalFeatures = [FeaturePCONFIG,
                                                   FeatureCLWB,
                                                   FeatureWBNOINVD];
-  list<SubtargetFeature> ICXTuning = ICLTuning;
+  list<SubtargetFeature> ICXTuning = ICLCommonTuning;
   list<SubtargetFeature> ICXFeatures =
     !listconcat(ICLFeatures, ICXAdditionalFeatures);
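The X86.td change above follows a common TableGen idiom: the entries shared by both CPUs move into a "Common" list, the contested flag is isolated in an "Additional" list, and the original list is rebuilt with !listconcat, so icelake-client keeps TuningPrefer256Bit while icelake-server (via ICXTuning = ICLCommonTuning) drops only that one flag. A minimal sketch of the idiom, using illustrative record names that are not in X86.td:

  class ExampleTuning {
    // Tunings shared by the client and server variants.
    list<SubtargetFeature> CommonTuning     = [TuningFastGather];
    // The one flag only the client variant should keep.
    list<SubtargetFeature> AdditionalTuning = [TuningPrefer256Bit];
    // Client = common + extra; server reuses just the common subset.
    list<SubtargetFeature> ClientTuning =
        !listconcat(CommonTuning, AdditionalTuning);
    list<SubtargetFeature> ServerTuning = CommonTuning;
  }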
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1,28 +1,37 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-AVX256,CHECK-NOVBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-AVX256,CHECK-VBMI256
 ; Make sure CPUs default to prefer-256-bit. avx512vnni isn't interesting as it just adds an isel peephole for vpmaddwd+vpaddd
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cascadelake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cooperlake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=cannonlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-client | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX256,CHECK-NOVBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cascadelake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX256,CHECK-NOVBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cooperlake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX256,CHECK-NOVBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=cannonlake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX256,CHECK-VBMI256
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-client | FileCheck %s --check-prefixes=CHECK,CHECK-AVX256,CHECK-VBMI256
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX256,CHECK-VBMI256
+; IceLake Server defaults to 512-bit.
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
 ; This file primarily contains tests for specific places in X86ISelLowering.cpp that needed to be made aware of the legalizer not allowing 512-bit vectors due to prefer-256-bit even though AVX512 is enabled.
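The rewritten RUN lines rely on FileCheck's multi-prefix support: each invocation activates a comma-separated set of prefixes, and a directive is enforced whenever one of its prefixes is active, so codegen shared by all configurations stays under CHECK while width- and VBMI-specific output moves under CHECK-AVX256, CHECK-AVX512, CHECK-NOVBMI, and CHECK-VBMI256. A reduced, hypothetical test (not part of this patch) showing the mechanism:

  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=icelake-client | FileCheck %s --check-prefixes=CHECK,CHECK-AVX256
  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
  define void @f() {
  ; CHECK-LABEL: f:
  ; CHECK: retq
    ret void
  }
  ; A CHECK: directive is verified by both RUN lines; a CHECK-AVX256:
  ; directive would be enforced only by the first invocation and a
  ; CHECK-AVX512: directive only by the second, letting one test body
  ; encode two different expected vector widths.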
define dso_local void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "min-legal-vector-width"="256" { -; CHECK-LABEL: add256: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1 -; CHECK-NEXT: vpaddd (%rsi), %ymm0, %ymm0 -; CHECK-NEXT: vmovdqa %ymm0, (%rdx) -; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-AVX256-LABEL: add256: +; CHECK-AVX256: # %bb.0: +; CHECK-AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-AVX256-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-AVX256-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1 +; CHECK-AVX256-NEXT: vpaddd (%rsi), %ymm0, %ymm0 +; CHECK-AVX256-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-AVX256-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-AVX256-NEXT: vzeroupper +; CHECK-AVX256-NEXT: retq +; +; CHECK-AVX512-LABEL: add256: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-AVX512-NEXT: vpaddd (%rsi), %zmm0, %zmm0 +; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) +; CHECK-AVX512-NEXT: vzeroupper +; CHECK-AVX512-NEXT: retq %d = load <16 x i32>, <16 x i32>* %a %e = load <16 x i32>, <16 x i32>* %b %f = add <16 x i32> %d, %e @@ -46,16 +55,24 @@ } define dso_local void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="256" { -; CHECK-LABEL: avg_v64i8_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rsi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1 -; CHECK-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; CHECK-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1 -; CHECK-NEXT: vmovdqu %ymm1, (%rax) -; CHECK-NEXT: vmovdqu %ymm0, (%rax) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-AVX256-LABEL: avg_v64i8_256: +; CHECK-AVX256: # %bb.0: +; CHECK-AVX256-NEXT: vmovdqa (%rsi), %ymm0 +; CHECK-AVX256-NEXT: vmovdqa 32(%rsi), %ymm1 +; CHECK-AVX256-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; CHECK-AVX256-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1 +; CHECK-AVX256-NEXT: vmovdqu %ymm1, (%rax) +; CHECK-AVX256-NEXT: vmovdqu %ymm0, (%rax) +; CHECK-AVX256-NEXT: vzeroupper +; CHECK-AVX256-NEXT: retq +; +; CHECK-AVX512-LABEL: avg_v64i8_256: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-AVX512-NEXT: vpavgb (%rsi), %zmm0, %zmm0 +; CHECK-AVX512-NEXT: vmovdqu64 %zmm0, (%rax) +; CHECK-AVX512-NEXT: vzeroupper +; CHECK-AVX512-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %a %2 = load <64 x i8>, <64 x i8>* %b %3 = zext <64 x i8> %1 to <64 x i32> @@ -90,16 +107,24 @@ } define dso_local void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "min-legal-vector-width"="256" { -; CHECK-LABEL: pmaddwd_32_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 -; CHECK-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 -; CHECK-NEXT: vmovdqa %ymm0, (%rdx) -; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-AVX256-LABEL: pmaddwd_32_256: +; CHECK-AVX256: # %bb.0: +; CHECK-AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-AVX256-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-AVX256-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 +; CHECK-AVX256-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 +; CHECK-AVX256-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-AVX256-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-AVX256-NEXT: vzeroupper +; CHECK-AVX256-NEXT: retq +; +; CHECK-AVX512-LABEL: pmaddwd_32_256: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-AVX512-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0 +; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) 
+; CHECK-AVX512-NEXT: vzeroupper +; CHECK-AVX512-NEXT: retq %A = load <32 x i16>, <32 x i16>* %APtr %B = load <32 x i16>, <32 x i16>* %BPtr %a = sext <32 x i16> %A to <32 x i32> @@ -133,16 +158,24 @@ } define dso_local void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "min-legal-vector-width"="256" { -; CHECK-LABEL: psubus_64i8_max_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1 -; CHECK-NEXT: vpsubusb (%rsi), %ymm0, %ymm0 -; CHECK-NEXT: vmovdqa %ymm0, (%rdx) -; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-AVX256-LABEL: psubus_64i8_max_256: +; CHECK-AVX256: # %bb.0: +; CHECK-AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-AVX256-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-AVX256-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1 +; CHECK-AVX256-NEXT: vpsubusb (%rsi), %ymm0, %ymm0 +; CHECK-AVX256-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-AVX256-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-AVX256-NEXT: vzeroupper +; CHECK-AVX256-NEXT: retq +; +; CHECK-AVX512-LABEL: psubus_64i8_max_256: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-AVX512-NEXT: vpsubusb (%rsi), %zmm0, %zmm0 +; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) +; CHECK-AVX512-NEXT: vzeroupper +; CHECK-AVX512-NEXT: retq %x = load <64 x i8>, <64 x i8>* %xptr %y = load <64 x i8>, <64 x i8>* %yptr %cmp = icmp ult <64 x i8> %x, %y @@ -170,40 +203,70 @@ } define dso_local i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) "min-legal-vector-width"="256" { -; CHECK-LABEL: _Z9test_charPcS_i_256: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB8_1: # %vector.body -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 -; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 -; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 -; CHECK-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 -; CHECK-NEXT: vpaddd %ymm2, %ymm3, %ymm2 -; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 -; CHECK-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 -; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm1 -; CHECK-NEXT: addq $32, %rcx -; CHECK-NEXT: cmpq %rcx, %rax -; CHECK-NEXT: jne .LBB8_1 -; CHECK-NEXT: # %bb.2: # %middle.block -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-AVX256-LABEL: _Z9test_charPcS_i_256: +; CHECK-AVX256: # %bb.0: # %entry +; CHECK-AVX256-NEXT: movl %edx, %eax +; CHECK-AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-AVX256-NEXT: xorl %ecx, %ecx +; CHECK-AVX256-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX256-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-AVX256-NEXT: .p2align 4, 0x90 +; CHECK-AVX256-NEXT: .LBB8_1: # %vector.body +; CHECK-AVX256-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-AVX256-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 +; CHECK-AVX256-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 +; CHECK-AVX256-NEXT: vpmovsxbw 
16(%rsi,%rcx), %ymm5 +; CHECK-AVX256-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 +; CHECK-AVX256-NEXT: vpaddd %ymm2, %ymm3, %ymm2 +; CHECK-AVX256-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 +; CHECK-AVX256-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 +; CHECK-AVX256-NEXT: vpaddd %ymm1, %ymm3, %ymm1 +; CHECK-AVX256-NEXT: addq $32, %rcx +; CHECK-AVX256-NEXT: cmpq %rcx, %rax +; CHECK-AVX256-NEXT: jne .LBB8_1 +; CHECK-AVX256-NEXT: # %bb.2: # %middle.block +; CHECK-AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm1 +; CHECK-AVX256-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; CHECK-AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; CHECK-AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX256-NEXT: vmovd %xmm0, %eax +; CHECK-AVX256-NEXT: vzeroupper +; CHECK-AVX256-NEXT: retq +; +; CHECK-AVX512-LABEL: _Z9test_charPcS_i_256: +; CHECK-AVX512: # %bb.0: # %entry +; CHECK-AVX512-NEXT: movl %edx, %eax +; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: xorl %ecx, %ecx +; CHECK-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512-NEXT: .p2align 4, 0x90 +; CHECK-AVX512-NEXT: .LBB8_1: # %vector.body +; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2 +; CHECK-AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3 +; CHECK-AVX512-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 +; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; CHECK-AVX512-NEXT: addq $32, %rcx +; CHECK-AVX512-NEXT: cmpq %rcx, %rax +; CHECK-AVX512-NEXT: jne .LBB8_1 +; CHECK-AVX512-NEXT: # %bb.2: # %middle.block +; CHECK-AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vmovd %xmm0, %eax +; CHECK-AVX512-NEXT: vzeroupper +; CHECK-AVX512-NEXT: retq entry: %3 = zext i32 %2 to i64 br label %vector.body @@ -310,30 +373,55 @@ @b = dso_local global [1024 x i8] zeroinitializer, align 16 define dso_local i32 @sad_16i8_256() "min-legal-vector-width"="256" { -; CHECK-LABEL: sad_16i8_256: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: movq $-1024, %rax # imm = 0xFC00 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB10_1: # %vector.body -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu a+1024(%rax), %xmm2 -; CHECK-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2 -; CHECK-NEXT: vpaddd %ymm1, %ymm2, %ymm1 -; CHECK-NEXT: addq $4, %rax -; CHECK-NEXT: jne .LBB10_1 -; CHECK-NEXT: # %bb.2: # %middle.block -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-AVX256-LABEL: sad_16i8_256: +; CHECK-AVX256: # %bb.0: # %entry +; 
CHECK-AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-AVX256-NEXT: movq $-1024, %rax # imm = 0xFC00 +; CHECK-AVX256-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX256-NEXT: .p2align 4, 0x90 +; CHECK-AVX256-NEXT: .LBB10_1: # %vector.body +; CHECK-AVX256-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-AVX256-NEXT: vmovdqu a+1024(%rax), %xmm2 +; CHECK-AVX256-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2 +; CHECK-AVX256-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; CHECK-AVX256-NEXT: addq $4, %rax +; CHECK-AVX256-NEXT: jne .LBB10_1 +; CHECK-AVX256-NEXT: # %bb.2: # %middle.block +; CHECK-AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; CHECK-AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX256-NEXT: vmovd %xmm0, %eax +; CHECK-AVX256-NEXT: vzeroupper +; CHECK-AVX256-NEXT: retq +; +; CHECK-AVX512-LABEL: sad_16i8_256: +; CHECK-AVX512: # %bb.0: # %entry +; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00 +; CHECK-AVX512-NEXT: .p2align 4, 0x90 +; CHECK-AVX512-NEXT: .LBB10_1: # %vector.body +; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-AVX512-NEXT: vmovdqu a+1024(%rax), %xmm1 +; CHECK-AVX512-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1 +; CHECK-AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-AVX512-NEXT: addq $4, %rax +; CHECK-AVX512-NEXT: jne .LBB10_1 +; CHECK-AVX512-NEXT: # %bb.2: # %middle.block +; CHECK-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vmovd %xmm0, %eax +; CHECK-AVX512-NEXT: vzeroupper +; CHECK-AVX512-NEXT: retq entry: br label %vector.body @@ -432,18 +520,27 @@ } define dso_local void @sbto16f32_256(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="256" { -; CHECK-LABEL: sbto16f32_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpmovw2m %ymm0, %k0 -; CHECK-NEXT: kshiftrw $8, %k0, %k1 -; CHECK-NEXT: vpmovm2d %k1, %ymm0 -; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-NEXT: vpmovm2d %k0, %ymm1 -; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 -; CHECK-NEXT: vmovaps %ymm1, (%rdi) -; CHECK-NEXT: vmovaps %ymm0, 32(%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-AVX256-LABEL: sbto16f32_256: +; CHECK-AVX256: # %bb.0: +; CHECK-AVX256-NEXT: vpmovw2m %ymm0, %k0 +; CHECK-AVX256-NEXT: kshiftrw $8, %k0, %k1 +; CHECK-AVX256-NEXT: vpmovm2d %k1, %ymm0 +; CHECK-AVX256-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-AVX256-NEXT: vpmovm2d %k0, %ymm1 +; CHECK-AVX256-NEXT: vcvtdq2ps %ymm1, %ymm1 +; CHECK-AVX256-NEXT: vmovaps %ymm1, (%rdi) +; CHECK-AVX256-NEXT: vmovaps %ymm0, 32(%rdi) +; CHECK-AVX256-NEXT: vzeroupper +; CHECK-AVX256-NEXT: retq +; +; CHECK-AVX512-LABEL: sbto16f32_256: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vpmovw2m %ymm0, %k0 +; CHECK-AVX512-NEXT: vpmovm2d %k0, %zmm0 +; CHECK-AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0 +; CHECK-AVX512-NEXT: vmovaps %zmm0, (%rdi) +; CHECK-AVX512-NEXT: vzeroupper +; CHECK-AVX512-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer %1 = sitofp <16 x i1> %mask to 
<16 x float> store <16 x float> %1, <16 x float>* %res @@ -466,24 +563,36 @@ } define dso_local void @sbto16f64_256(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="256" { -; CHECK-LABEL: sbto16f64_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpmovw2m %ymm0, %k0 -; CHECK-NEXT: kshiftrw $8, %k0, %k1 -; CHECK-NEXT: vpmovm2d %k1, %ymm0 -; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0 -; CHECK-NEXT: vpmovm2d %k0, %ymm2 -; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3 -; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 -; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2 -; CHECK-NEXT: vmovaps %ymm2, 32(%rdi) -; CHECK-NEXT: vmovaps %ymm3, (%rdi) -; CHECK-NEXT: vmovaps %ymm0, 96(%rdi) -; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-AVX256-LABEL: sbto16f64_256: +; CHECK-AVX256: # %bb.0: +; CHECK-AVX256-NEXT: vpmovw2m %ymm0, %k0 +; CHECK-AVX256-NEXT: kshiftrw $8, %k0, %k1 +; CHECK-AVX256-NEXT: vpmovm2d %k1, %ymm0 +; CHECK-AVX256-NEXT: vcvtdq2pd %xmm0, %ymm1 +; CHECK-AVX256-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-AVX256-NEXT: vcvtdq2pd %xmm0, %ymm0 +; CHECK-AVX256-NEXT: vpmovm2d %k0, %ymm2 +; CHECK-AVX256-NEXT: vcvtdq2pd %xmm2, %ymm3 +; CHECK-AVX256-NEXT: vextracti128 $1, %ymm2, %xmm2 +; CHECK-AVX256-NEXT: vcvtdq2pd %xmm2, %ymm2 +; CHECK-AVX256-NEXT: vmovaps %ymm2, 32(%rdi) +; CHECK-AVX256-NEXT: vmovaps %ymm3, (%rdi) +; CHECK-AVX256-NEXT: vmovaps %ymm0, 96(%rdi) +; CHECK-AVX256-NEXT: vmovaps %ymm1, 64(%rdi) +; CHECK-AVX256-NEXT: vzeroupper +; CHECK-AVX256-NEXT: retq +; +; CHECK-AVX512-LABEL: sbto16f64_256: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vpmovw2m %ymm0, %k0 +; CHECK-AVX512-NEXT: vpmovm2d %k0, %zmm0 +; CHECK-AVX512-NEXT: vcvtdq2pd %ymm0, %zmm1 +; CHECK-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; CHECK-AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 +; CHECK-AVX512-NEXT: vmovaps %zmm0, 64(%rdi) +; CHECK-AVX512-NEXT: vmovaps %zmm1, (%rdi) +; CHECK-AVX512-NEXT: vzeroupper +; CHECK-AVX512-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer %1 = sitofp <16 x i1> %mask to <16 x double> store <16 x double> %1, <16 x double>* %res @@ -509,20 +618,30 @@ } define dso_local void @ubto16f32_256(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="256" { -; CHECK-LABEL: ubto16f32_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpmovw2m %ymm0, %k0 -; CHECK-NEXT: kshiftrw $8, %k0, %k1 -; CHECK-NEXT: vpmovm2d %k1, %ymm0 -; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0 -; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-NEXT: vpmovm2d %k0, %ymm1 -; CHECK-NEXT: vpsrld $31, %ymm1, %ymm1 -; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 -; CHECK-NEXT: vmovaps %ymm1, (%rdi) -; CHECK-NEXT: vmovaps %ymm0, 32(%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-AVX256-LABEL: ubto16f32_256: +; CHECK-AVX256: # %bb.0: +; CHECK-AVX256-NEXT: vpmovw2m %ymm0, %k0 +; CHECK-AVX256-NEXT: kshiftrw $8, %k0, %k1 +; CHECK-AVX256-NEXT: vpmovm2d %k1, %ymm0 +; CHECK-AVX256-NEXT: vpsrld $31, %ymm0, %ymm0 +; CHECK-AVX256-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-AVX256-NEXT: vpmovm2d %k0, %ymm1 +; CHECK-AVX256-NEXT: vpsrld $31, %ymm1, %ymm1 +; CHECK-AVX256-NEXT: vcvtdq2ps %ymm1, %ymm1 +; CHECK-AVX256-NEXT: vmovaps %ymm1, (%rdi) +; CHECK-AVX256-NEXT: vmovaps %ymm0, 32(%rdi) +; CHECK-AVX256-NEXT: vzeroupper +; CHECK-AVX256-NEXT: retq +; +; CHECK-AVX512-LABEL: ubto16f32_256: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vpmovw2m %ymm0, %k0 +; CHECK-AVX512-NEXT: vpmovm2d %k0, %zmm0 +; CHECK-AVX512-NEXT: vpsrld $31, %zmm0, %zmm0 +; CHECK-AVX512-NEXT: 
vcvtdq2ps %zmm0, %zmm0 +; CHECK-AVX512-NEXT: vmovaps %zmm0, (%rdi) +; CHECK-AVX512-NEXT: vzeroupper +; CHECK-AVX512-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer %1 = uitofp <16 x i1> %mask to <16 x float> store <16 x float> %1, <16 x float>* %res @@ -546,26 +665,39 @@ } define dso_local void @ubto16f64_256(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="256" { -; CHECK-LABEL: ubto16f64_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpmovw2m %ymm0, %k0 -; CHECK-NEXT: kshiftrw $8, %k0, %k1 -; CHECK-NEXT: vpmovm2d %k1, %ymm0 -; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0 -; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0 -; CHECK-NEXT: vpmovm2d %k0, %ymm2 -; CHECK-NEXT: vpsrld $31, %ymm2, %ymm2 -; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3 -; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 -; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2 -; CHECK-NEXT: vmovaps %ymm2, 32(%rdi) -; CHECK-NEXT: vmovaps %ymm3, (%rdi) -; CHECK-NEXT: vmovaps %ymm0, 96(%rdi) -; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-AVX256-LABEL: ubto16f64_256: +; CHECK-AVX256: # %bb.0: +; CHECK-AVX256-NEXT: vpmovw2m %ymm0, %k0 +; CHECK-AVX256-NEXT: kshiftrw $8, %k0, %k1 +; CHECK-AVX256-NEXT: vpmovm2d %k1, %ymm0 +; CHECK-AVX256-NEXT: vpsrld $31, %ymm0, %ymm0 +; CHECK-AVX256-NEXT: vcvtdq2pd %xmm0, %ymm1 +; CHECK-AVX256-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-AVX256-NEXT: vcvtdq2pd %xmm0, %ymm0 +; CHECK-AVX256-NEXT: vpmovm2d %k0, %ymm2 +; CHECK-AVX256-NEXT: vpsrld $31, %ymm2, %ymm2 +; CHECK-AVX256-NEXT: vcvtdq2pd %xmm2, %ymm3 +; CHECK-AVX256-NEXT: vextracti128 $1, %ymm2, %xmm2 +; CHECK-AVX256-NEXT: vcvtdq2pd %xmm2, %ymm2 +; CHECK-AVX256-NEXT: vmovaps %ymm2, 32(%rdi) +; CHECK-AVX256-NEXT: vmovaps %ymm3, (%rdi) +; CHECK-AVX256-NEXT: vmovaps %ymm0, 96(%rdi) +; CHECK-AVX256-NEXT: vmovaps %ymm1, 64(%rdi) +; CHECK-AVX256-NEXT: vzeroupper +; CHECK-AVX256-NEXT: retq +; +; CHECK-AVX512-LABEL: ubto16f64_256: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vpmovw2m %ymm0, %k0 +; CHECK-AVX512-NEXT: vpmovm2d %k0, %zmm0 +; CHECK-AVX512-NEXT: vpsrld $31, %zmm0, %zmm0 +; CHECK-AVX512-NEXT: vcvtdq2pd %ymm0, %zmm1 +; CHECK-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; CHECK-AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 +; CHECK-AVX512-NEXT: vmovaps %zmm0, 64(%rdi) +; CHECK-AVX512-NEXT: vmovaps %zmm1, (%rdi) +; CHECK-AVX512-NEXT: vzeroupper +; CHECK-AVX512-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer %1 = uitofp <16 x i1> %mask to <16 x double> store <16 x double> %1, <16 x double>* %res @@ -592,17 +724,25 @@ } define <16 x i16> @test_16f32toub_256(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="256" { -; CHECK-LABEL: test_16f32toub_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1 -; CHECK-NEXT: vpslld $31, %ymm1, %ymm1 -; CHECK-NEXT: vpmovd2m %ymm1, %k0 -; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm1 -; CHECK-NEXT: vpslld $31, %ymm1, %ymm1 -; CHECK-NEXT: vpmovd2m %ymm1, %k1 -; CHECK-NEXT: kunpckbw %k0, %k1, %k1 -; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq +; CHECK-AVX256-LABEL: test_16f32toub_256: +; CHECK-AVX256: # %bb.0: +; CHECK-AVX256-NEXT: vcvttps2dq (%rdi), %ymm1 +; CHECK-AVX256-NEXT: vpslld $31, %ymm1, %ymm1 +; CHECK-AVX256-NEXT: vpmovd2m %ymm1, %k0 +; CHECK-AVX256-NEXT: vcvttps2dq 32(%rdi), %ymm1 +; CHECK-AVX256-NEXT: vpslld $31, %ymm1, %ymm1 +; CHECK-AVX256-NEXT: vpmovd2m %ymm1, %k1 +; CHECK-AVX256-NEXT: kunpckbw %k0, %k1, %k1 +; CHECK-AVX256-NEXT: vmovdqu16 %ymm0, %ymm0 
{%k1} {z} +; CHECK-AVX256-NEXT: retq +; +; CHECK-AVX512-LABEL: test_16f32toub_256: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vcvttps2dq (%rdi), %zmm1 +; CHECK-AVX512-NEXT: vpslld $31, %zmm1, %zmm1 +; CHECK-AVX512-NEXT: vpmovd2m %zmm1, %k1 +; CHECK-AVX512-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} +; CHECK-AVX512-NEXT: retq %a = load <16 x float>, <16 x float>* %ptr %mask = fptoui <16 x float> %a to <16 x i1> %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer @@ -624,15 +764,22 @@ } define <16 x i16> @test_16f32tosb_256(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="256" { -; CHECK-LABEL: test_16f32tosb_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1 -; CHECK-NEXT: vpmovd2m %ymm1, %k0 -; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm1 -; CHECK-NEXT: vpmovd2m %ymm1, %k1 -; CHECK-NEXT: kunpckbw %k0, %k1, %k1 -; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq +; CHECK-AVX256-LABEL: test_16f32tosb_256: +; CHECK-AVX256: # %bb.0: +; CHECK-AVX256-NEXT: vcvttps2dq (%rdi), %ymm1 +; CHECK-AVX256-NEXT: vpmovd2m %ymm1, %k0 +; CHECK-AVX256-NEXT: vcvttps2dq 32(%rdi), %ymm1 +; CHECK-AVX256-NEXT: vpmovd2m %ymm1, %k1 +; CHECK-AVX256-NEXT: kunpckbw %k0, %k1, %k1 +; CHECK-AVX256-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} +; CHECK-AVX256-NEXT: retq +; +; CHECK-AVX512-LABEL: test_16f32tosb_256: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vcvttps2dq (%rdi), %zmm1 +; CHECK-AVX512-NEXT: vpmovd2m %zmm1, %k1 +; CHECK-AVX512-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} +; CHECK-AVX512-NEXT: retq %a = load <16 x float>, <16 x float>* %ptr %mask = fptosi <16 x float> %a to <16 x i1> %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer @@ -653,61 +800,77 @@ } define dso_local void @mul256(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-legal-vector-width"="256" { +; CHECK-NOVBMI-LABEL: mul256: +; CHECK-NOVBMI: # %bb.0: +; CHECK-NOVBMI-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NOVBMI-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NOVBMI-NEXT: vmovdqa (%rsi), %ymm2 +; CHECK-NOVBMI-NEXT: vmovdqa 32(%rsi), %ymm3 +; CHECK-NOVBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-NOVBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-NOVBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; CHECK-NOVBMI-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-NOVBMI-NEXT: vpand %ymm5, %ymm4, %ymm4 +; CHECK-NOVBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-NOVBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-NOVBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; CHECK-NOVBMI-NEXT: vpand %ymm5, %ymm1, %ymm1 +; CHECK-NOVBMI-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 +; CHECK-NOVBMI-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-NOVBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-NOVBMI-NEXT: vpmullw %ymm3, %ymm4, %ymm3 +; CHECK-NOVBMI-NEXT: vpand %ymm5, %ymm3, %ymm3 +; CHECK-NOVBMI-NEXT: vpunpcklbw {{.*#+}} ymm2 = 
ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-NOVBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-NOVBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; CHECK-NOVBMI-NEXT: vpand %ymm5, %ymm0, %ymm0 +; CHECK-NOVBMI-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; CHECK-NOVBMI-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NOVBMI-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NOVBMI-NEXT: vzeroupper +; CHECK-NOVBMI-NEXT: retq +; +; CHECK-VBMI256-LABEL: mul256: +; CHECK-VBMI256: # %bb.0: +; CHECK-VBMI256-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-VBMI256-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-VBMI256-NEXT: vmovdqa (%rsi), %ymm2 +; CHECK-VBMI256-NEXT: vmovdqa 32(%rsi), %ymm3 +; CHECK-VBMI256-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-VBMI256-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-VBMI256-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; CHECK-VBMI256-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-VBMI256-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-VBMI256-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; CHECK-VBMI256-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62] +; CHECK-VBMI256-NEXT: vpermt2b %ymm4, %ymm3, %ymm1 +; CHECK-VBMI256-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-VBMI256-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-VBMI256-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; CHECK-VBMI256-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-VBMI256-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-VBMI256-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; CHECK-VBMI256-NEXT: vpermt2b %ymm4, %ymm3, %ymm0 +; CHECK-VBMI256-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-VBMI256-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-VBMI256-NEXT: vzeroupper +; CHECK-VBMI256-NEXT: retq +; ; CHECK-AVX512-LABEL: mul256: ; CHECK-AVX512: # %bb.0: -; CHECK-AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2 -; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-AVX512-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; CHECK-AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-AVX512-NEXT: vpand %ymm5, %ymm4, %ymm4 -; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-AVX512-NEXT: vpmullw %ymm3, 
%ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpand %ymm5, %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm4, %ymm3 -; CHECK-AVX512-NEXT: vpand %ymm5, %ymm3, %ymm3 -; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpand %ymm5, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx) -; CHECK-AVX512-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 +; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; CHECK-AVX512-NEXT: vpmullw %zmm2, %zmm3, %zmm2 +; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; CHECK-AVX512-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; CHECK-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126] +; CHECK-AVX512-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 +; CHECK-AVX512-NEXT: vmovdqa64 %zmm1, (%rdx) ; CHECK-AVX512-NEXT: vzeroupper ; CHECK-AVX512-NEXT: retq -; -; CHECK-VBMI-LABEL: mul256: -; CHECK-VBMI: # %bb.0: -; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-VBMI-NEXT: vmovdqa (%rsi), %ymm2 -; CHECK-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62] -; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm1 -; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm0 -; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm0 -; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx) -; CHECK-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx) -; CHECK-VBMI-NEXT: vzeroupper -; CHECK-VBMI-NEXT: retq %d = load <64 x i8>, <64 x i8>* %a %e = load <64 x i8>, <64 x i8>* %b %f = mul <64 x i8> %d, %e @@ -716,6 +879,40 @@ } define dso_local void @mul512(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-legal-vector-width"="512" { +; CHECK-NOVBMI-LABEL: mul512: +; CHECK-NOVBMI: # %bb.0: +; CHECK-NOVBMI-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NOVBMI-NEXT: vmovdqa64 (%rsi), %zmm1 +; CHECK-NOVBMI-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; CHECK-NOVBMI-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; CHECK-NOVBMI-NEXT: vpmullw %zmm2, %zmm3, %zmm2 +; CHECK-NOVBMI-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-NOVBMI-NEXT: vpandq %zmm3, %zmm2, %zmm2 +; CHECK-NOVBMI-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; CHECK-NOVBMI-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; CHECK-NOVBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; CHECK-NOVBMI-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; CHECK-NOVBMI-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; CHECK-NOVBMI-NEXT: vmovdqa64 %zmm0, (%rdx) +; CHECK-NOVBMI-NEXT: vzeroupper +; CHECK-NOVBMI-NEXT: retq +; +; CHECK-VBMI256-LABEL: mul512: +; CHECK-VBMI256: # %bb.0: +; CHECK-VBMI256-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-VBMI256-NEXT: vmovdqa64 (%rsi), %zmm1 +; CHECK-VBMI256-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; CHECK-VBMI256-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; CHECK-VBMI256-NEXT: vpmullw %zmm2, %zmm3, %zmm2 +; 
CHECK-VBMI256-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; CHECK-VBMI256-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; CHECK-VBMI256-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; CHECK-VBMI256-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126] +; CHECK-VBMI256-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 +; CHECK-VBMI256-NEXT: vmovdqa64 %zmm1, (%rdx) +; CHECK-VBMI256-NEXT: vzeroupper +; CHECK-VBMI256-NEXT: retq +; ; CHECK-AVX512-LABEL: mul512: ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 @@ -723,32 +920,14 @@ ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; CHECK-AVX512-NEXT: vpmullw %zmm2, %zmm3, %zmm2 -; CHECK-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-AVX512-NEXT: vpandq %zmm3, %zmm2, %zmm2 ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; CHECK-AVX512-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; CHECK-AVX512-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; CHECK-AVX512-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) +; CHECK-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126] +; CHECK-AVX512-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 +; CHECK-AVX512-NEXT: vmovdqa64 %zmm1, (%rdx) ; CHECK-AVX512-NEXT: vzeroupper ; CHECK-AVX512-NEXT: retq -; -; CHECK-VBMI-LABEL: mul512: -; CHECK-VBMI: # %bb.0: -; CHECK-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0 -; CHECK-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1 -; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; CHECK-VBMI-NEXT: vpmullw %zmm2, %zmm3, %zmm2 -; 
CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; CHECK-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126] -; CHECK-VBMI-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 -; CHECK-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx) -; CHECK-VBMI-NEXT: vzeroupper -; CHECK-VBMI-NEXT: retq %d = load <64 x i8>, <64 x i8>* %a %e = load <64 x i8>, <64 x i8>* %b %f = mul <64 x i8> %d, %e @@ -770,98 +949,144 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) define <16 x i32> @trunc_v16i64_v16i32(<16 x i64>* %x) nounwind "min-legal-vector-width"="256" { -; CHECK-LABEL: trunc_v16i64_v16i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vmovdqa 64(%rdi), %ymm2 -; CHECK-NEXT: vmovdqa 96(%rdi), %ymm3 -; CHECK-NEXT: vpmovqd %ymm0, %xmm0 -; CHECK-NEXT: vpmovqd %ymm1, %xmm1 -; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; CHECK-NEXT: vpmovqd %ymm2, %xmm1 -; CHECK-NEXT: vpmovqd %ymm3, %xmm2 -; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: retq +; CHECK-AVX256-LABEL: trunc_v16i64_v16i32: +; CHECK-AVX256: # %bb.0: +; CHECK-AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-AVX256-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-AVX256-NEXT: vmovdqa 64(%rdi), %ymm2 +; CHECK-AVX256-NEXT: vmovdqa 96(%rdi), %ymm3 +; CHECK-AVX256-NEXT: vpmovqd %ymm0, %xmm0 +; CHECK-AVX256-NEXT: vpmovqd %ymm1, %xmm1 +; CHECK-AVX256-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-AVX256-NEXT: vpmovqd %ymm2, %xmm1 +; CHECK-AVX256-NEXT: vpmovqd %ymm3, %xmm2 +; CHECK-AVX256-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-AVX256-NEXT: retq +; +; CHECK-AVX512-LABEL: trunc_v16i64_v16i32: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 +; CHECK-AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; CHECK-AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; CHECK-AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; CHECK-AVX512-NEXT: retq %a = load <16 x i64>, <16 x i64>* %x %b = trunc <16 x i64> %a to <16 x i32> ret <16 x i32> %b } define <16 x i8> @trunc_v16i64_v16i8(<16 x i64>* %x) nounwind "min-legal-vector-width"="256" { -; CHECK-LABEL: trunc_v16i64_v16i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vmovdqa 64(%rdi), %ymm2 -; CHECK-NEXT: vmovdqa 96(%rdi), %ymm3 -; CHECK-NEXT: vpmovqb %ymm3, %xmm3 -; CHECK-NEXT: vpmovqb %ymm2, %xmm2 -; CHECK-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-NEXT: vpmovqb %ymm1, %xmm1 -; CHECK-NEXT: vpmovqb %ymm0, %xmm0 -; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-AVX256-LABEL: trunc_v16i64_v16i8: +; CHECK-AVX256: # %bb.0: +; CHECK-AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-AVX256-NEXT: vmovdqa 32(%rdi), %ymm1 +; 
CHECK-AVX256-NEXT: vmovdqa 64(%rdi), %ymm2 +; CHECK-AVX256-NEXT: vmovdqa 96(%rdi), %ymm3 +; CHECK-AVX256-NEXT: vpmovqb %ymm3, %xmm3 +; CHECK-AVX256-NEXT: vpmovqb %ymm2, %xmm2 +; CHECK-AVX256-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-AVX256-NEXT: vpmovqb %ymm1, %xmm1 +; CHECK-AVX256-NEXT: vpmovqb %ymm0, %xmm0 +; CHECK-AVX256-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-AVX256-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-AVX256-NEXT: vzeroupper +; CHECK-AVX256-NEXT: retq +; +; CHECK-AVX512-LABEL: trunc_v16i64_v16i8: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 +; CHECK-AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; CHECK-AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; CHECK-AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-AVX512-NEXT: vzeroupper +; CHECK-AVX512-NEXT: retq %a = load <16 x i64>, <16 x i64>* %x %b = trunc <16 x i64> %a to <16 x i8> ret <16 x i8> %b } define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" { -; CHECK-LABEL: trunc_v16i32_v16i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpmovdb %ymm1, %xmm1 -; CHECK-NEXT: vpmovdb %ymm0, %xmm0 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-AVX256-LABEL: trunc_v16i32_v16i8: +; CHECK-AVX256: # %bb.0: +; CHECK-AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-AVX256-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-AVX256-NEXT: vpmovdb %ymm1, %xmm1 +; CHECK-AVX256-NEXT: vpmovdb %ymm0, %xmm0 +; CHECK-AVX256-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-AVX256-NEXT: vzeroupper +; CHECK-AVX256-NEXT: retq +; +; CHECK-AVX512-LABEL: trunc_v16i32_v16i8: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; CHECK-AVX512-NEXT: vzeroupper +; CHECK-AVX512-NEXT: retq %a = load <16 x i32>, <16 x i32>* %x %b = trunc <16 x i32> %a to <16 x i8> ret <16 x i8> %b } define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" { -; CHECK-LABEL: trunc_v8i64_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpmovqb %ymm1, %xmm1 -; CHECK-NEXT: vpmovqb %ymm0, %xmm0 -; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-AVX256-LABEL: trunc_v8i64_v8i8: +; CHECK-AVX256: # %bb.0: +; CHECK-AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-AVX256-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-AVX256-NEXT: vpmovqb %ymm1, %xmm1 +; CHECK-AVX256-NEXT: vpmovqb %ymm0, %xmm0 +; CHECK-AVX256-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-AVX256-NEXT: vzeroupper +; CHECK-AVX256-NEXT: retq +; +; CHECK-AVX512-LABEL: trunc_v8i64_v8i8: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; CHECK-AVX512-NEXT: vzeroupper +; CHECK-AVX512-NEXT: retq %a = load <8 x i64>, <8 x i64>* %x %b = trunc <8 x i64> %a to <8 x i8> ret <8 x i8> %b } define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" { -; CHECK-LABEL: trunc_v8i64_v8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpmovqw %ymm1, %xmm1 -; CHECK-NEXT: vpmovqw %ymm0, %xmm0 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-AVX256-LABEL: trunc_v8i64_v8i16:
+; CHECK-AVX256: # %bb.0:
+; CHECK-AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-AVX256-NEXT: vmovdqa 32(%rdi), %ymm1
+; CHECK-AVX256-NEXT: vpmovqw %ymm1, %xmm1
+; CHECK-AVX256-NEXT: vpmovqw %ymm0, %xmm0
+; CHECK-AVX256-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-AVX256-NEXT: vzeroupper
+; CHECK-AVX256-NEXT: retq
+;
+; CHECK-AVX512-LABEL: trunc_v8i64_v8i16:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
 %a = load <8 x i64>, <8 x i64>* %x
 %b = trunc <8 x i64> %a to <8 x i16>
 ret <8 x i16> %b
 }

 define <8 x i32> @trunc_v8i64_v8i32_zeroes(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
-; CHECK-LABEL: trunc_v8i64_v8i32_zeroes:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpsrlq $48, 32(%rdi), %ymm0
-; CHECK-NEXT: vpsrlq $48, (%rdi), %ymm1
-; CHECK-NEXT: vpackusdw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; CHECK-NEXT: retq
+; CHECK-AVX256-LABEL: trunc_v8i64_v8i32_zeroes:
+; CHECK-AVX256: # %bb.0:
+; CHECK-AVX256-NEXT: vpsrlq $48, 32(%rdi), %ymm0
+; CHECK-AVX256-NEXT: vpsrlq $48, (%rdi), %ymm1
+; CHECK-AVX256-NEXT: vpackusdw %ymm0, %ymm1, %ymm0
+; CHECK-AVX256-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-AVX256-NEXT: retq
+;
+; CHECK-AVX512-LABEL: trunc_v8i64_v8i32_zeroes:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpsrlq $48, (%rdi), %zmm0
+; CHECK-AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; CHECK-AVX512-NEXT: retq
 %a = load <8 x i64>, <8 x i64>* %x
 %b = lshr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
 %c = trunc <8 x i64> %b to <8 x i32>
@@ -869,12 +1094,18 @@
 }

 define <16 x i16> @trunc_v16i32_v16i16_zeroes(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
-; CHECK-LABEL: trunc_v16i32_v16i16_zeroes:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-AVX256-LABEL: trunc_v16i32_v16i16_zeroes:
+; CHECK-AVX256: # %bb.0:
+; CHECK-AVX256-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-AVX256-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+; CHECK-AVX256-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
+; CHECK-AVX256-NEXT: retq
+;
+; CHECK-AVX512-LABEL: trunc_v16i32_v16i16_zeroes:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpsrld $16, (%rdi), %zmm0
+; CHECK-AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; CHECK-AVX512-NEXT: retq
 %a = load <16 x i32>, <16 x i32>* %x
 %b = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 %c = trunc <16 x i32> %b to <16 x i16>
@@ -882,20 +1113,26 @@
 }

 define <32 x i8> @trunc_v32i16_v32i8_zeroes(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" {
+; CHECK-NOVBMI-LABEL: trunc_v32i16_v32i8_zeroes:
+; CHECK-NOVBMI: # %bb.0:
+; CHECK-NOVBMI-NEXT: vpsrlw $8, 32(%rdi), %ymm0
+; CHECK-NOVBMI-NEXT: vpsrlw $8, (%rdi), %ymm1
+; CHECK-NOVBMI-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
+; CHECK-NOVBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-NOVBMI-NEXT: retq
+;
+; CHECK-VBMI256-LABEL: trunc_v32i16_v32i8_zeroes:
+; CHECK-VBMI256: # %bb.0:
+; CHECK-VBMI256-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-VBMI256-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
+; CHECK-VBMI256-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
+; CHECK-VBMI256-NEXT: retq
+;
 ; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_zeroes:
 ; CHECK-AVX512: # %bb.0:
-; CHECK-AVX512-NEXT: vpsrlw $8, 32(%rdi), %ymm0
-; CHECK-AVX512-NEXT: vpsrlw $8, (%rdi), %ymm1
-; CHECK-AVX512-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
-; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-AVX512-NEXT: vpsrlw $8, (%rdi), %zmm0
+; CHECK-AVX512-NEXT: vpmovwb %zmm0, %ymm0
 ; CHECK-AVX512-NEXT: retq
-;
-; CHECK-VBMI-LABEL: trunc_v32i16_v32i8_zeroes:
-; CHECK-VBMI: # %bb.0:
-; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
-; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
-; CHECK-VBMI-NEXT: retq
 %a = load <32 x i16>, <32 x i16>* %x
 %b = lshr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 %c = trunc <32 x i16> %b to <32 x i8>
@@ -903,14 +1140,20 @@
 }

 define <8 x i32> @trunc_v8i64_v8i32_sign(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
-; CHECK-LABEL: trunc_v8i64_v8i32_sign:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpsraq $48, 32(%rdi), %ymm0
-; CHECK-NEXT: vpsraq $48, (%rdi), %ymm1
-; CHECK-NEXT: vpmovqd %ymm1, %xmm1
-; CHECK-NEXT: vpmovqd %ymm0, %xmm0
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-AVX256-LABEL: trunc_v8i64_v8i32_sign:
+; CHECK-AVX256: # %bb.0:
+; CHECK-AVX256-NEXT: vpsraq $48, 32(%rdi), %ymm0
+; CHECK-AVX256-NEXT: vpsraq $48, (%rdi), %ymm1
+; CHECK-AVX256-NEXT: vpmovqd %ymm1, %xmm1
+; CHECK-AVX256-NEXT: vpmovqd %ymm0, %xmm0
+; CHECK-AVX256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-AVX256-NEXT: retq
+;
+; CHECK-AVX512-LABEL: trunc_v8i64_v8i32_sign:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpsraq $48, (%rdi), %zmm0
+; CHECK-AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; CHECK-AVX512-NEXT: retq
 %a = load <8 x i64>, <8 x i64>* %x
 %b = ashr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
 %c = trunc <8 x i64> %b to <8 x i32>
@@ -918,12 +1161,18 @@
 }

 define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
-; CHECK-LABEL: trunc_v16i32_v16i16_sign:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-AVX256-LABEL: trunc_v16i32_v16i16_sign:
+; CHECK-AVX256: # %bb.0:
+; CHECK-AVX256-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-AVX256-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+; CHECK-AVX256-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
+; CHECK-AVX256-NEXT: retq
+;
+; CHECK-AVX512-LABEL: trunc_v16i32_v16i16_sign:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpsrld $16, (%rdi), %zmm0
+; CHECK-AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; CHECK-AVX512-NEXT: retq
 %a = load <16 x i32>, <16 x i32>* %x
 %b = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 %c = trunc <16 x i32> %b to <16 x i16>
@@ -931,20 +1180,26 @@
 }

 define <32 x i8> @trunc_v32i16_v32i8_sign(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" {
+; CHECK-NOVBMI-LABEL: trunc_v32i16_v32i8_sign:
+; CHECK-NOVBMI: # %bb.0:
+; CHECK-NOVBMI-NEXT: vpsrlw $8, 32(%rdi), %ymm0
+; CHECK-NOVBMI-NEXT: vpsrlw $8, (%rdi), %ymm1
+; CHECK-NOVBMI-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
+; CHECK-NOVBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-NOVBMI-NEXT: retq
+;
+; CHECK-VBMI256-LABEL: trunc_v32i16_v32i8_sign:
+; CHECK-VBMI256: # %bb.0:
+; CHECK-VBMI256-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-VBMI256-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
+; CHECK-VBMI256-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
+; CHECK-VBMI256-NEXT: retq
+;
 ; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_sign:
 ; CHECK-AVX512: # %bb.0:
-; CHECK-AVX512-NEXT: vpsrlw $8, 32(%rdi), %ymm0
-; CHECK-AVX512-NEXT: vpsrlw $8, (%rdi), %ymm1
-; CHECK-AVX512-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
-; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-AVX512-NEXT: vpsrlw $8, (%rdi), %zmm0
+; CHECK-AVX512-NEXT: vpmovwb %zmm0, %ymm0
 ; CHECK-AVX512-NEXT: retq
-;
-; CHECK-VBMI-LABEL: trunc_v32i16_v32i8_sign:
-; CHECK-VBMI: # %bb.0:
-; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
-; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
-; CHECK-VBMI-NEXT: retq
 %a = load <32 x i16>, <32 x i16>* %x
 %b = ashr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 %c = trunc <32 x i16> %b to <32 x i8>
@@ -952,62 +1207,93 @@
 }

 define dso_local void @zext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal-vector-width"="256" {
-; CHECK-LABEL: zext_v16i8_v16i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vmovdqa %ymm0, (%rdi)
-; CHECK-NEXT: vmovdqa %ymm1, 64(%rdi)
-; CHECK-NEXT: vmovdqa %ymm3, 96(%rdi)
-; CHECK-NEXT: vmovdqa %ymm2, 32(%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-AVX256-LABEL: zext_v16i8_v16i64:
+; CHECK-AVX256: # %bb.0:
+; CHECK-AVX256-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; CHECK-AVX256-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; CHECK-AVX256-NEXT: vextracti128 $1, %ymm1, %xmm1
+; CHECK-AVX256-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; CHECK-AVX256-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; CHECK-AVX256-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; CHECK-AVX256-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; CHECK-AVX256-NEXT: vmovdqa %ymm0, (%rdi)
+; CHECK-AVX256-NEXT: vmovdqa %ymm1, 64(%rdi)
+; CHECK-AVX256-NEXT: vmovdqa %ymm3, 96(%rdi)
+; CHECK-AVX256-NEXT: vmovdqa %ymm2, 32(%rdi)
+; CHECK-AVX256-NEXT: vzeroupper
+; CHECK-AVX256-NEXT: retq
+;
+; CHECK-AVX512-LABEL: zext_v16i8_v16i64:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-AVX512-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdi)
+; CHECK-AVX512-NEXT: vmovdqa64 %zmm1, (%rdi)
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
 %a = zext <16 x i8> %x to <16 x i64>
 store <16 x i64> %a, <16 x i64>* %y
 ret void
 }

 define dso_local void @sext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal-vector-width"="256" {
-; CHECK-LABEL: sext_v16i8_v16i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbw %xmm0, %ymm1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; CHECK-NEXT: vpmovsxwq %xmm2, %ymm2
-; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; CHECK-NEXT: vpmovsxwq %xmm3, %ymm3
-; CHECK-NEXT: vpmovsxwq %xmm1, %ymm1
-; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0
-; CHECK-NEXT: vmovdqa %ymm0, (%rdi)
-; CHECK-NEXT: vmovdqa %ymm1, 64(%rdi)
-; CHECK-NEXT: vmovdqa %ymm3, 96(%rdi)
-; CHECK-NEXT: vmovdqa %ymm2, 32(%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-AVX256-LABEL: sext_v16i8_v16i64:
+; CHECK-AVX256: # %bb.0:
+; CHECK-AVX256-NEXT: vpmovsxbw %xmm0, %ymm1
+; CHECK-AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; CHECK-AVX256-NEXT: vpmovsxwq %xmm2, %ymm2
+; CHECK-AVX256-NEXT: vextracti128 $1, %ymm1, %xmm1
+; CHECK-AVX256-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; CHECK-AVX256-NEXT: vpmovsxwq %xmm3, %ymm3
+; CHECK-AVX256-NEXT: vpmovsxwq %xmm1, %ymm1
+; CHECK-AVX256-NEXT: vpmovsxbq %xmm0, %ymm0
+; CHECK-AVX256-NEXT: vmovdqa %ymm0, (%rdi)
+; CHECK-AVX256-NEXT: vmovdqa %ymm1, 64(%rdi)
+; CHECK-AVX256-NEXT: vmovdqa %ymm3, 96(%rdi)
+; CHECK-AVX256-NEXT: vmovdqa %ymm2, 32(%rdi)
+; CHECK-AVX256-NEXT: vzeroupper
+; CHECK-AVX256-NEXT: retq
+;
+; CHECK-AVX512-LABEL: sext_v16i8_v16i64:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-AVX512-NEXT: vpmovsxwq %xmm0, %zmm1
+; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-AVX512-NEXT: vpmovsxwq %xmm0, %zmm0
+; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdi)
+; CHECK-AVX512-NEXT: vmovdqa64 %zmm1, (%rdi)
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
 %a = sext <16 x i8> %x to <16 x i64>
 store <16 x i64> %a, <16 x i64>* %y
 ret void
 }

 define dso_local void @vselect_split_v8i16_setcc(<8 x i16> %s, <8 x i16> %t, <8 x i64>* %p, <8 x i64>* %q, <8 x i64>* %r) "min-legal-vector-width"="256" {
-; CHECK-LABEL: vselect_split_v8i16_setcc:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rsi), %ymm2
-; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
-; CHECK-NEXT: kshiftrb $4, %k1, %k2
-; CHECK-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2}
-; CHECK-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1}
-; CHECK-NEXT: vmovdqa %ymm2, (%rdx)
-; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-AVX256-LABEL: vselect_split_v8i16_setcc:
+; CHECK-AVX256: # %bb.0:
+; CHECK-AVX256-NEXT: vmovdqa (%rsi), %ymm2
+; CHECK-AVX256-NEXT: vmovdqa 32(%rsi), %ymm3
+; CHECK-AVX256-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-AVX256-NEXT: kshiftrb $4, %k1, %k2
+; CHECK-AVX256-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2}
+; CHECK-AVX256-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1}
+; CHECK-AVX256-NEXT: vmovdqa %ymm2, (%rdx)
+; CHECK-AVX256-NEXT: vmovdqa %ymm3, 32(%rdx)
+; CHECK-AVX256-NEXT: vzeroupper
+; CHECK-AVX256-NEXT: retq
+;
+; CHECK-AVX512-LABEL: vselect_split_v8i16_setcc:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 {%k1}
+; CHECK-AVX512-NEXT: vmovdqa64 %zmm2, (%rdx)
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
 %x = load <8 x i64>, <8 x i64>* %p
 %y = load <8 x i64>, <8 x i64>* %q
 %a = icmp eq <8 x i16> %s, %t
@@ -1017,18 +1303,27 @@
 }

 define dso_local void @vselect_split_v8i32_setcc(<8 x i32> %s, <8 x i32> %t, <8 x i64>* %p, <8 x i64>* %q, <8 x i64>* %r) "min-legal-vector-width"="256" {
-; CHECK-LABEL: vselect_split_v8i32_setcc:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rsi), %ymm2
-; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
-; CHECK-NEXT: kshiftrb $4, %k1, %k2
-; CHECK-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2}
-; CHECK-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1}
-; CHECK-NEXT: vmovdqa %ymm2, (%rdx)
-; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-AVX256-LABEL: vselect_split_v8i32_setcc:
+; CHECK-AVX256: # %bb.0:
+; CHECK-AVX256-NEXT: vmovdqa (%rsi), %ymm2
+; CHECK-AVX256-NEXT: vmovdqa 32(%rsi), %ymm3
+; CHECK-AVX256-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-AVX256-NEXT: kshiftrb $4, %k1, %k2
+; CHECK-AVX256-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2}
+; CHECK-AVX256-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1}
+; CHECK-AVX256-NEXT: vmovdqa %ymm2, (%rdx)
+; CHECK-AVX256-NEXT: vmovdqa %ymm3, 32(%rdx)
+; CHECK-AVX256-NEXT: vzeroupper
+; CHECK-AVX256-NEXT: retq
+;
+; CHECK-AVX512-LABEL: vselect_split_v8i32_setcc:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-AVX512-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 {%k1}
+; CHECK-AVX512-NEXT: vmovdqa64 %zmm2, (%rdx)
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
 %x = load <8 x i64>, <8 x i64>* %p
 %y = load <8 x i64>, <8 x i64>* %q
 %a = icmp eq <8 x i32> %s, %t
@@ -1038,18 +1333,27 @@
 }

 define dso_local void @vselect_split_v16i8_setcc(<16 x i8> %s, <16 x i8> %t, <16 x i32>* %p, <16 x i32>* %q, <16 x i32>* %r) "min-legal-vector-width"="256" {
-; CHECK-LABEL: vselect_split_v16i8_setcc:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rsi), %ymm2
-; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
-; CHECK-NEXT: kshiftrw $8, %k1, %k2
-; CHECK-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2}
-; CHECK-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1}
-; CHECK-NEXT: vmovdqa %ymm2, (%rdx)
-; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-AVX256-LABEL: vselect_split_v16i8_setcc:
+; CHECK-AVX256: # %bb.0:
+; CHECK-AVX256-NEXT: vmovdqa (%rsi), %ymm2
+; CHECK-AVX256-NEXT: vmovdqa 32(%rsi), %ymm3
+; CHECK-AVX256-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-AVX256-NEXT: kshiftrw $8, %k1, %k2
+; CHECK-AVX256-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2}
+; CHECK-AVX256-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1}
+; CHECK-AVX256-NEXT: vmovdqa %ymm2, (%rdx)
+; CHECK-AVX256-NEXT: vmovdqa %ymm3, 32(%rdx)
+; CHECK-AVX256-NEXT: vzeroupper
+; CHECK-AVX256-NEXT: retq
+;
+; CHECK-AVX512-LABEL: vselect_split_v16i8_setcc:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-AVX512-NEXT: vmovdqa32 (%rdi), %zmm2 {%k1}
+; CHECK-AVX512-NEXT: vmovdqa64 %zmm2, (%rdx)
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
 %x = load <16 x i32>, <16 x i32>* %p
 %y = load <16 x i32>, <16 x i32>* %q
 %a = icmp eq <16 x i8> %s, %t
@@ -1059,18 +1363,27 @@
 }

 define dso_local void @vselect_split_v16i16_setcc(<16 x i16> %s, <16 x i16> %t, <16 x i32>* %p, <16 x i32>* %q, <16 x i32>* %r) "min-legal-vector-width"="256" {
-; CHECK-LABEL: vselect_split_v16i16_setcc:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rsi), %ymm2
-; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
-; CHECK-NEXT: kshiftrw $8, %k1, %k2
-; CHECK-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2}
-; CHECK-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1}
-; CHECK-NEXT: vmovdqa %ymm2, (%rdx)
-; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-AVX256-LABEL: vselect_split_v16i16_setcc:
+; CHECK-AVX256: # %bb.0:
+; CHECK-AVX256-NEXT: vmovdqa (%rsi), %ymm2
+; CHECK-AVX256-NEXT: vmovdqa 32(%rsi), %ymm3
+; CHECK-AVX256-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-AVX256-NEXT: kshiftrw $8, %k1, %k2
+; CHECK-AVX256-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2}
+; CHECK-AVX256-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1}
+; CHECK-AVX256-NEXT: vmovdqa %ymm2, (%rdx)
+; CHECK-AVX256-NEXT: vmovdqa %ymm3, 32(%rdx)
+; CHECK-AVX256-NEXT: vzeroupper
+; CHECK-AVX256-NEXT: retq
+;
+; CHECK-AVX512-LABEL: vselect_split_v16i16_setcc:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-AVX512-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-AVX512-NEXT: vmovdqa32 (%rdi), %zmm2 {%k1}
+; CHECK-AVX512-NEXT: vmovdqa64 %zmm2, (%rdx)
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
 %x = load <16 x i32>, <16 x i32>* %p
 %y = load <16 x i32>, <16 x i32>* %q
 %a = icmp eq <16 x i16> %s, %t
@@ -1080,14 +1393,22 @@
 }

 define <16 x i8> @trunc_packus_v16i32_v16i8(<16 x i32>* %p) "min-legal-vector-width"="256" {
-; CHECK-LABEL: trunc_packus_v16i32_v16i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm0
-; CHECK-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; CHECK-NEXT: vpmovuswb %ymm0, %xmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-AVX256-LABEL: trunc_packus_v16i32_v16i8:
+; CHECK-AVX256: # %bb.0:
+; CHECK-AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-AVX256-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0
+; CHECK-AVX256-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-AVX256-NEXT: vpmovuswb %ymm0, %xmm0
+; CHECK-AVX256-NEXT: vzeroupper
+; CHECK-AVX256-NEXT: retq
+;
+; CHECK-AVX512-LABEL: trunc_packus_v16i32_v16i8:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vpmaxsd (%rdi), %zmm0, %zmm0
+; CHECK-AVX512-NEXT: vpmovusdb %zmm0, %xmm0
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
 %a = load <16 x i32>, <16 x i32>* %p
 %b = icmp slt <16 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
 %c = select <16 x i1> %b, <16 x i32> %a, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -1098,14 +1419,22 @@
 }

 define dso_local void @trunc_packus_v16i32_v16i8_store(<16 x i32>* %p, <16 x i8>* %q) "min-legal-vector-width"="256" {
-; CHECK-LABEL: trunc_packus_v16i32_v16i8_store:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm0
-; CHECK-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; CHECK-NEXT: vpmovuswb %ymm0, (%rsi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-AVX256-LABEL: trunc_packus_v16i32_v16i8_store:
+; CHECK-AVX256: # %bb.0:
+; CHECK-AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-AVX256-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0
+; CHECK-AVX256-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-AVX256-NEXT: vpmovuswb %ymm0, (%rsi)
+; CHECK-AVX256-NEXT: vzeroupper
+; CHECK-AVX256-NEXT: retq
+;
+; CHECK-AVX512-LABEL: trunc_packus_v16i32_v16i8_store:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vpmaxsd (%rdi), %zmm0, %zmm0
+; CHECK-AVX512-NEXT: vpmovusdb %zmm0, (%rsi)
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
 %a = load <16 x i32>, <16 x i32>* %p
 %b = icmp slt <16 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
 %c = select <16 x i1> %b, <16 x i32> %a, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -1124,454 +1453,465 @@
 }

 define dso_local void @v64i1_shuffle(<64 x i8>* %x, <64 x i8>* %y) "min-legal-vector-width"="256" {
-; CHECK-LABEL: v64i1_shuffle:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0
-; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k0
-; CHECK-NEXT: kshiftrd $1, %k0, %k1
-; CHECK-NEXT: movq $-3, %rax
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftlq $63, %k0, %k2
-; CHECK-NEXT: kshiftrq $62, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-5, %rax
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $3, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $61, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-9, %rax
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $2, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $60, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-17, %rax
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $5, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $59, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-33, %rax
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $4, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $58, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-65, %rax
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $7, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $57, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-129, %rax
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $6, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $56, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-257, %rax # imm = 0xFEFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $9, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $55, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-513, %rax # imm = 0xFDFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $8, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $54, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-1025, %rax # imm = 0xFBFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $11, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $53, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-2049, %rax # imm = 0xF7FF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $10, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $52, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-4097, %rax # imm = 0xEFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $13, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $51, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-8193, %rax # imm = 0xDFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $12, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $50, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-16385, %rax # imm = 0xBFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $15, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $49, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-32769, %rax # imm = 0xFFFF7FFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $14, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $48, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-65537, %rax # imm = 0xFFFEFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $17, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $47, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-131073, %rax # imm = 0xFFFDFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $16, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $46, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-262145, %rax # imm = 0xFFFBFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $19, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $45, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-524289, %rax # imm = 0xFFF7FFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $18, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $44, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-1048577, %rax # imm = 0xFFEFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $21, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $43, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-2097153, %rax # imm = 0xFFDFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $20, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $42, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-4194305, %rax # imm = 0xFFBFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $23, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $41, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-8388609, %rax # imm = 0xFF7FFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $22, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $40, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-16777217, %rax # imm = 0xFEFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $25, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $39, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-33554433, %rax # imm = 0xFDFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $24, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $38, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-67108865, %rax # imm = 0xFBFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $27, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $37, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-134217729, %rax # imm = 0xF7FFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $26, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $36, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-268435457, %rax # imm = 0xEFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $29, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $35, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-536870913, %rax # imm = 0xDFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $28, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $34, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movq $-1073741825, %rax # imm = 0xBFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k1
-; CHECK-NEXT: kshiftrd $31, %k0, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $33, %k2, %k2
-; CHECK-NEXT: korq %k2, %k1, %k1
-; CHECK-NEXT: movabsq $-2147483649, %rax # imm = 0xFFFFFFFF7FFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k1, %k2
-; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1
-; CHECK-NEXT: kshiftrd $30, %k0, %k0
-; CHECK-NEXT: kshiftlq $63, %k0, %k0
-; CHECK-NEXT: kshiftrq $32, %k0, %k0
-; CHECK-NEXT: korq %k0, %k2, %k0
-; CHECK-NEXT: movabsq $-4294967297, %rax # imm = 0xFFFFFFFEFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $1, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $31, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-8589934593, %rax # imm = 0xFFFFFFFDFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftlq $63, %k1, %k2
-; CHECK-NEXT: kshiftrq $30, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-17179869185, %rax # imm = 0xFFFFFFFBFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $3, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $29, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-34359738369, %rax # imm = 0xFFFFFFF7FFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $2, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $28, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-68719476737, %rax # imm = 0xFFFFFFEFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $5, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $27, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-137438953473, %rax # imm = 0xFFFFFFDFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $4, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $26, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-274877906945, %rax # imm = 0xFFFFFFBFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $7, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $25, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-549755813889, %rax # imm = 0xFFFFFF7FFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $6, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $24, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-1099511627777, %rax # imm = 0xFFFFFEFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $9, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $23, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-2199023255553, %rax # imm = 0xFFFFFDFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $8, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $22, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-4398046511105, %rax # imm = 0xFFFFFBFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $11, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $21, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-8796093022209, %rax # imm = 0xFFFFF7FFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $10, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $20, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-17592186044417, %rax # imm = 0xFFFFEFFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $13, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $19, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-35184372088833, %rax # imm = 0xFFFFDFFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $12, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $18, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-70368744177665, %rax # imm = 0xFFFFBFFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $15, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $17, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-140737488355329, %rax # imm = 0xFFFF7FFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $14, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $16, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-281474976710657, %rax # imm = 0xFFFEFFFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $17, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $15, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-562949953421313, %rax # imm = 0xFFFDFFFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $16, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $14, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-1125899906842625, %rax # imm = 0xFFFBFFFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $19, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $13, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-2251799813685249, %rax # imm = 0xFFF7FFFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $18, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $12, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-4503599627370497, %rax # imm = 0xFFEFFFFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $21, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $11, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-9007199254740993, %rax # imm = 0xFFDFFFFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $20, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $10, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-18014398509481985, %rax # imm = 0xFFBFFFFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $23, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $9, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-36028797018963969, %rax # imm = 0xFF7FFFFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $22, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $8, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-72057594037927937, %rax # imm = 0xFEFFFFFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $25, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $7, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-144115188075855873, %rax # imm = 0xFDFFFFFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $24, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $6, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-288230376151711745, %rax # imm = 0xFBFFFFFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $27, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $5, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-576460752303423489, %rax # imm = 0xF7FFFFFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $26, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $4, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-1152921504606846977, %rax # imm = 0xEFFFFFFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $29, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $3, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-2305843009213693953, %rax # imm = 0xDFFFFFFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $28, %k1, %k2
-; CHECK-NEXT: kshiftlq $63, %k2, %k2
-; CHECK-NEXT: kshiftrq $2, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: movabsq $-4611686018427387905, %rax # imm = 0xBFFFFFFFFFFFFFFF
-; CHECK-NEXT: kmovq %rax, %k2
-; CHECK-NEXT: kandq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $31, %k1, %k2
-; CHECK-NEXT: kshiftlq $62, %k2, %k2
-; CHECK-NEXT: korq %k2, %k0, %k0
-; CHECK-NEXT: kshiftrd $30, %k1, %k1
-; CHECK-NEXT: kshiftlq $1, %k0, %k0
-; CHECK-NEXT: kshiftrq $1, %k0, %k0
-; CHECK-NEXT: kshiftlq $63, %k1, %k1
-; CHECK-NEXT: korq %k1, %k0, %k1
-; CHECK-NEXT: vmovdqu8 %ymm1, (%rsi) {%k1}
-; CHECK-NEXT: kshiftrq $32, %k1, %k1
-; CHECK-NEXT: vmovdqu8 %ymm0, 32(%rsi) {%k1}
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-AVX256-LABEL: v64i1_shuffle:
+; CHECK-AVX256: # %bb.0: # %entry
+; CHECK-AVX256-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-AVX256-NEXT: vmovdqa 32(%rdi), %ymm0
+; CHECK-AVX256-NEXT: vptestnmb %ymm1, %ymm1, %k0
+; CHECK-AVX256-NEXT: kshiftrd $1, %k0, %k1
+; CHECK-AVX256-NEXT: movq $-3, %rax
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftlq $63, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftrq $62, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-5, %rax
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $3, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $61, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-9, %rax
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $2, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $60, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-17, %rax
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $5, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $59, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-33, %rax
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $4, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $58, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-65, %rax
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $7, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $57, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-129, %rax
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $6, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $56, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-257, %rax # imm = 0xFEFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $9, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $55, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-513, %rax # imm = 0xFDFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $8, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $54, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-1025, %rax # imm = 0xFBFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $11, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $53, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-2049, %rax # imm = 0xF7FF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $10, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $52, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-4097, %rax # imm = 0xEFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $13, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $51, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-8193, %rax # imm = 0xDFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $12, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $50, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-16385, %rax # imm = 0xBFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $15, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $49, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-32769, %rax # imm = 0xFFFF7FFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $14, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $48, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-65537, %rax # imm = 0xFFFEFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $17, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $47, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-131073, %rax # imm = 0xFFFDFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $16, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $46, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-262145, %rax # imm = 0xFFFBFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $19, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $45, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-524289, %rax # imm = 0xFFF7FFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $18, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $44, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-1048577, %rax # imm = 0xFFEFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $21, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $43, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-2097153, %rax # imm = 0xFFDFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $20, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $42, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-4194305, %rax # imm = 0xFFBFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $23, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $41, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-8388609, %rax # imm = 0xFF7FFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $22, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $40, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-16777217, %rax # imm = 0xFEFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $25, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $39, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-33554433, %rax # imm = 0xFDFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $24, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $38, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-67108865, %rax # imm = 0xFBFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $27, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $37, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-134217729, %rax # imm = 0xF7FFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $26, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $36, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-268435457, %rax # imm = 0xEFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $29, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $35, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-536870913, %rax # imm = 0xDFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $28, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $34, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movq $-1073741825, %rax # imm = 0xBFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftrd $31, %k0, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $33, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k1, %k1
+; CHECK-AVX256-NEXT: movabsq $-2147483649, %rax # imm = 0xFFFFFFFF7FFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k1, %k2
+; CHECK-AVX256-NEXT: vptestnmb %ymm0, %ymm0, %k1
+; CHECK-AVX256-NEXT: kshiftrd $30, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftlq $63, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrq $32, %k0, %k0
+; CHECK-AVX256-NEXT: korq %k0, %k2, %k0
+; CHECK-AVX256-NEXT: movabsq $-4294967297, %rax # imm = 0xFFFFFFFEFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $1, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $31, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-8589934593, %rax # imm = 0xFFFFFFFDFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftlq $63, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftrq $30, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-17179869185, %rax # imm = 0xFFFFFFFBFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $3, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $29, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-34359738369, %rax # imm = 0xFFFFFFF7FFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $2, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $28, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-68719476737, %rax # imm = 0xFFFFFFEFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $5, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $27, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-137438953473, %rax # imm = 0xFFFFFFDFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $4, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $26, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-274877906945, %rax # imm = 0xFFFFFFBFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $7, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $25, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-549755813889, %rax # imm = 0xFFFFFF7FFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $6, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $24, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-1099511627777, %rax # imm = 0xFFFFFEFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $9, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $23, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-2199023255553, %rax # imm = 0xFFFFFDFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $8, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $22, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-4398046511105, %rax # imm = 0xFFFFFBFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $11, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $21, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-8796093022209, %rax # imm = 0xFFFFF7FFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $10, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $20, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-17592186044417, %rax # imm = 0xFFFFEFFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $13, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $19, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-35184372088833, %rax # imm = 0xFFFFDFFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $12, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $18, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-70368744177665, %rax # imm = 0xFFFFBFFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $15, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $17, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-140737488355329, %rax # imm = 0xFFFF7FFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $14, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $16, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-281474976710657, %rax # imm = 0xFFFEFFFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $17, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $15, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-562949953421313, %rax # imm = 0xFFFDFFFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $16, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $14, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-1125899906842625, %rax # imm = 0xFFFBFFFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $19, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $13, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-2251799813685249, %rax # imm = 0xFFF7FFFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $18, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $12, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-4503599627370497, %rax # imm = 0xFFEFFFFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $21, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $11, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-9007199254740993, %rax # imm = 0xFFDFFFFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $20, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $10, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-18014398509481985, %rax # imm = 0xFFBFFFFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $23, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $9, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-36028797018963969, %rax # imm = 0xFF7FFFFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $22, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $8, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-72057594037927937, %rax # imm = 0xFEFFFFFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $25, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $7, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-144115188075855873, %rax # imm = 0xFDFFFFFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $24, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $6, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-288230376151711745, %rax # imm = 0xFBFFFFFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $27, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $5, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-576460752303423489, %rax # imm = 0xF7FFFFFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $26, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $4, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-1152921504606846977, %rax # imm = 0xEFFFFFFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $29, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $3, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-2305843009213693953, %rax # imm = 0xDFFFFFFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $28, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $63, %k2, %k2
+; CHECK-AVX256-NEXT: kshiftrq $2, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: movabsq $-4611686018427387905, %rax # imm = 0xBFFFFFFFFFFFFFFF
+; CHECK-AVX256-NEXT: kmovq %rax, %k2
+; CHECK-AVX256-NEXT: kandq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $31, %k1, %k2
+; CHECK-AVX256-NEXT: kshiftlq $62, %k2, %k2
+; CHECK-AVX256-NEXT: korq %k2, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrd $30, %k1, %k1
+; CHECK-AVX256-NEXT: kshiftlq $1, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftrq $1, %k0, %k0
+; CHECK-AVX256-NEXT: kshiftlq $63, %k1, %k1
+; CHECK-AVX256-NEXT: korq %k1, %k0, %k1
+; CHECK-AVX256-NEXT: vmovdqu8 %ymm1, (%rsi) {%k1}
+; CHECK-AVX256-NEXT: kshiftrq $32, %k1, %k1
+; CHECK-AVX256-NEXT: vmovdqu8 %ymm0, 32(%rsi) {%k1}
+; CHECK-AVX256-NEXT: vzeroupper
+; CHECK-AVX256-NEXT: retq
+;
+; CHECK-AVX512-LABEL: v64i1_shuffle:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-AVX512-NEXT: vptestnmb %zmm0, %zmm0, %k0
+; CHECK-AVX512-NEXT: vpmovm2b %k0, %zmm1
+; CHECK-AVX512-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
+; CHECK-AVX512-NEXT: vpmovb2m %zmm1, %k1
+; CHECK-AVX512-NEXT: vmovdqu8 %zmm0, (%rsi) {%k1}
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
 entry:
 %a = load <64 x i8>, <64 x i8>* %x
 %b = icmp eq <64 x i8> %a, zeroinitializer
@@ -1601,16 +1941,25 @@
 }

 define dso_local void @cmp_v8i64_sext(<8 x i64>* %xptr, <8 x i64>* %yptr, <8 x i64>* %zptr) "min-legal-vector-width"="256" {
-; CHECK-LABEL: cmp_v8i64_sext:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rsi), %ymm0
-; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1
-; CHECK-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1
-; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
-; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
-; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-AVX256-LABEL: cmp_v8i64_sext:
+; CHECK-AVX256: # %bb.0:
+; CHECK-AVX256-NEXT: vmovdqa (%rsi), %ymm0
+; CHECK-AVX256-NEXT: vmovdqa 32(%rsi), %ymm1
+; CHECK-AVX256-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1
+; CHECK-AVX256-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
+; CHECK-AVX256-NEXT: vmovdqa %ymm0, (%rdx)
+; CHECK-AVX256-NEXT: vmovdqa %ymm1, 32(%rdx)
+; CHECK-AVX256-NEXT: vzeroupper
+; CHECK-AVX256-NEXT: retq
+;
+; CHECK-AVX512-LABEL: cmp_v8i64_sext:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm0
+; CHECK-AVX512-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; CHECK-AVX512-NEXT: vpmovm2q %k0, %zmm0
+; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
 %x = load <8 x i64>, <8 x i64>* %xptr
 %y = load <8 x i64>, <8 x i64>* %yptr
 %cmp = icmp slt <8 x i64> %x, %y
@@ -1620,18 +1969,28 @@
 }

 define dso_local void @cmp_v8i64_zext(<8 x i64>* %xptr, <8 x i64>* %yptr, <8 x i64>* %zptr) "min-legal-vector-width"="256" {
-; CHECK-LABEL: cmp_v8i64_zext:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rsi), %ymm0
-; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1
-; CHECK-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1
-; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
-; CHECK-NEXT: vpsrlq $63, %ymm1, %ymm1
-; CHECK-NEXT: vpsrlq $63, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
-; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-AVX256-LABEL: cmp_v8i64_zext:
+; CHECK-AVX256: # %bb.0:
+; CHECK-AVX256-NEXT: vmovdqa (%rsi), %ymm0
+; CHECK-AVX256-NEXT: vmovdqa 32(%rsi), %ymm1
+; CHECK-AVX256-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1
+; CHECK-AVX256-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
+; CHECK-AVX256-NEXT: vpsrlq $63, %ymm1, %ymm1
+; CHECK-AVX256-NEXT: vpsrlq $63, %ymm0, %ymm0
+; CHECK-AVX256-NEXT: vmovdqa %ymm0, (%rdx)
+; CHECK-AVX256-NEXT: vmovdqa %ymm1, 32(%rdx)
+; CHECK-AVX256-NEXT: vzeroupper
+; CHECK-AVX256-NEXT: retq
+;
+; CHECK-AVX512-LABEL: cmp_v8i64_zext:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm0
+; CHECK-AVX512-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; CHECK-AVX512-NEXT: vpmovm2q %k0, %zmm0
+; CHECK-AVX512-NEXT: vpsrlq $63, %zmm0, %zmm0
+; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
 %x = load <8 x i64>, <8 x i64>* %xptr
 %y = load <8 x i64>, <8 x i64>* %yptr
 %cmp = icmp slt <8 x i64> %x, %y
@@ -1641,30 +2000,43 @@
 }

 define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind "min-legal-vector-width"="256" {
+; CHECK-NOVBMI-LABEL: var_rotate_v16i8:
+; CHECK-NOVBMI: # %bb.0:
+; CHECK-NOVBMI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-NOVBMI-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; CHECK-NOVBMI-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NOVBMI-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30]
+; CHECK-NOVBMI-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; CHECK-NOVBMI-NEXT: vpsrlw $8, %ymm0, %ymm0
+; CHECK-NOVBMI-NEXT: vpmovwb %ymm0, %xmm0
+; CHECK-NOVBMI-NEXT: vzeroupper
+; CHECK-NOVBMI-NEXT: retq
+;
+; CHECK-VBMI256-LABEL: var_rotate_v16i8:
+; CHECK-VBMI256: # %bb.0:
+; CHECK-VBMI256-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-VBMI256-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-VBMI256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-VBMI256-NEXT: vpermb %ymm0, %ymm2, %ymm0
+; CHECK-VBMI256-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; CHECK-VBMI256-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; CHECK-VBMI256-NEXT: vpsrlw $8, %ymm0, %ymm0
+; CHECK-VBMI256-NEXT: vpmovwb %ymm0, %xmm0
+; CHECK-VBMI256-NEXT: vzeroupper
+; CHECK-VBMI256-NEXT: retq
+;
 ; CHECK-AVX512-LABEL: var_rotate_v16i8:
 ; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; CHECK-AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512-NEXT: vpermb %ymm0, %ymm2, %ymm0
 ; CHECK-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; CHECK-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; CHECK-AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30]
 ; CHECK-AVX512-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
 ; CHECK-AVX512-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; CHECK-AVX512-NEXT: vpmovwb %ymm0, %xmm0
 ; CHECK-AVX512-NEXT: vzeroupper
 ; CHECK-AVX512-NEXT: retq
-;
-; CHECK-VBMI-LABEL: var_rotate_v16i8:
-; CHECK-VBMI: # %bb.0:
-; CHECK-VBMI-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; CHECK-VBMI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-VBMI-NEXT: vpermb %ymm0, %ymm2, %ymm0
-; CHECK-VBMI-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; CHECK-VBMI-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
-; CHECK-VBMI-NEXT: vpsrlw $8, %ymm0, %ymm0
-; CHECK-VBMI-NEXT: vpmovwb %ymm0, %xmm0
-; CHECK-VBMI-NEXT: vzeroupper
-; CHECK-VBMI-NEXT: retq
 %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
 %shl = shl <16 x i8> %a, %b
 %lshr = lshr <16 x i8> %a, %b8
@@ -1673,20 +2045,32 @@
 }

 define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-legal-vector-width"="256" {
-; CHECK-LABEL: var_rotate_v32i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
-; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-NEXT: vpsllvw %ymm3, %ymm4, %ymm3
-; CHECK-NEXT: vpsrlw $8, %ymm3, %ymm3
-; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
-; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0
-; CHECK-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-AVX256-LABEL: var_rotate_v32i8:
+; CHECK-AVX256: # %bb.0:
+; CHECK-AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; CHECK-AVX256-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-AVX256-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; CHECK-AVX256-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; CHECK-AVX256-NEXT: vpsllvw %ymm3, %ymm4, %ymm3
+; CHECK-AVX256-NEXT: vpsrlw $8, %ymm3, %ymm3
+; CHECK-AVX256-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; CHECK-AVX256-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; CHECK-AVX256-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; CHECK-AVX256-NEXT: vpsrlw $8, %ymm0, %ymm0
+; CHECK-AVX256-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; CHECK-AVX256-NEXT: retq
+;
+; CHECK-AVX512-LABEL: var_rotate_v32i8:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; CHECK-AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; CHECK-AVX512-NEXT: vpermb %zmm0, %zmm2, %zmm0
+; CHECK-AVX512-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; CHECK-AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; CHECK-AVX512-NEXT: vpsrlw $8, %zmm0, %zmm0
+; CHECK-AVX512-NEXT: vpmovwb %zmm0, %ymm0
+; CHECK-AVX512-NEXT: retq
 %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
 %shl = shl <32 x i8> %a, %b
 %lshr = lshr <32 x i8> %a, %b8
@@ -1715,16 +2099,25 @@
 }

 define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
-; CHECK-LABEL: constant_rotate_v32i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; CHECK-NEXT: vpsrlw $8, %ymm1, %ymm1
-; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0
-; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-AVX256-LABEL: constant_rotate_v32i8:
+; CHECK-AVX256: # %bb.0:
+; CHECK-AVX256-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; CHECK-AVX256-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; CHECK-AVX256-NEXT: vpsrlw $8, %ymm1, %ymm1
+; CHECK-AVX256-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; CHECK-AVX256-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; CHECK-AVX256-NEXT: vpsrlw $8, %ymm0, %ymm0
+; CHECK-AVX256-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; CHECK-AVX256-NEXT: retq
+;
+; CHECK-AVX512-LABEL: constant_rotate_v32i8:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; CHECK-AVX512-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
+; CHECK-AVX512-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; CHECK-AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
+; CHECK-AVX512-NEXT: vpmovwb %zmm0, %ymm0
+; CHECK-AVX512-NEXT: retq
 %shl =
shl <32 x i8> %a, %lshr = lshr <32 x i8> %a, %or = or <32 x i8> %shl, %lshr