Index: llvm/trunk/lib/Target/X86/X86DomainReassignment.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86DomainReassignment.cpp +++ llvm/trunk/lib/Target/X86/X86DomainReassignment.cpp @@ -70,13 +70,13 @@ static const TargetRegisterClass *getDstRC(const TargetRegisterClass *SrcRC, RegDomain Domain) { assert(Domain == MaskDomain && "add domain"); - if (SrcRC == &X86::GR8RegClass) + if (X86::GR8RegClass.hasSubClassEq(SrcRC)) return &X86::VK8RegClass; - if (SrcRC == &X86::GR16RegClass) + if (X86::GR16RegClass.hasSubClassEq(SrcRC)) return &X86::VK16RegClass; - if (SrcRC == &X86::GR32RegClass) + if (X86::GR32RegClass.hasSubClassEq(SrcRC)) return &X86::VK32RegClass; - if (SrcRC == &X86::GR64RegClass) + if (X86::GR64RegClass.hasSubClassEq(SrcRC)) return &X86::VK64RegClass; llvm_unreachable("add register class"); return nullptr; Index: llvm/trunk/test/CodeGen/X86/gpr-to-mask.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/gpr-to-mask.ll +++ llvm/trunk/test/CodeGen/X86/gpr-to-mask.ll @@ -1,20 +1,40 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq < %s | FileCheck %s --check-prefix=X86-64 +; RUN: llc -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq < %s | FileCheck %s --check-prefix=X86-32 define void @test_fcmp_storefloat(i1 %cond, float* %fptr, float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) { -; CHECK-LABEL: test_fcmp_storefloat: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB0_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: vcmpeqss %xmm3, %xmm2, %k1 -; CHECK-NEXT: jmp .LBB0_3 -; CHECK-NEXT: .LBB0_2: # %else -; CHECK-NEXT: vcmpeqss %xmm5, %xmm4, %k1 -; CHECK-NEXT: .LBB0_3: # %exit -; CHECK-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovss %xmm1, (%rsi) -; CHECK-NEXT: retq +; X86-64-LABEL: test_fcmp_storefloat: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB0_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: vcmpeqss %xmm3, %xmm2, %k1 +; X86-64-NEXT: jmp .LBB0_3 +; X86-64-NEXT: .LBB0_2: # %else +; X86-64-NEXT: vcmpeqss %xmm5, %xmm4, %k1 +; X86-64-NEXT: .LBB0_3: # %exit +; X86-64-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1} +; X86-64-NEXT: vmovss %xmm1, (%rsi) +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_fcmp_storefloat: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB0_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-32-NEXT: vcmpeqss {{[0-9]+}}(%esp), %xmm2, %k1 +; X86-32-NEXT: jmp .LBB0_3 +; X86-32-NEXT: .LBB0_2: # %else +; X86-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-32-NEXT: vcmpeqss {{[0-9]+}}(%esp), %xmm2, %k1 +; X86-32-NEXT: .LBB0_3: # %exit +; X86-32-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; X86-32-NEXT: vmovss %xmm0, (%eax) +; X86-32-NEXT: retl entry: br i1 %cond, label %if, label %else @@ -34,20 +54,38 @@ } define void @test_fcmp_storei1(i1 %cond, float* %fptr, i1* %iptr, float %f1, float %f2, float %f3, float %f4) { -; CHECK-LABEL: test_fcmp_storei1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB1_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: vcmpeqss %xmm1, %xmm0, %k0 -; CHECK-NEXT: jmp .LBB1_3 -; CHECK-NEXT: .LBB1_2: # %else -; CHECK-NEXT: vcmpeqss %xmm3, %xmm2, %k0 -; CHECK-NEXT: .LBB1_3: # %exit -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movb %al, (%rdx) -; CHECK-NEXT: retq +; X86-64-LABEL: test_fcmp_storei1: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB1_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: vcmpeqss %xmm1, %xmm0, %k0 +; X86-64-NEXT: jmp .LBB1_3 +; X86-64-NEXT: .LBB1_2: # %else +; X86-64-NEXT: vcmpeqss %xmm3, %xmm2, %k0 +; X86-64-NEXT: .LBB1_3: # %exit +; X86-64-NEXT: kmovd %k0, %eax +; X86-64-NEXT: andb $1, %al +; X86-64-NEXT: movb %al, (%rdx) +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_fcmp_storei1: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB1_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-32-NEXT: vcmpeqss {{[0-9]+}}(%esp), %xmm0, %k0 +; X86-32-NEXT: jmp .LBB1_3 +; X86-32-NEXT: .LBB1_2: # %else +; X86-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-32-NEXT: vcmpeqss {{[0-9]+}}(%esp), %xmm0, %k0 +; X86-32-NEXT: .LBB1_3: # %exit +; X86-32-NEXT: kmovd %k0, %ecx +; X86-32-NEXT: andb $1, %cl +; X86-32-NEXT: movb %cl, (%eax) +; X86-32-NEXT: retl entry: br i1 %cond, label %if, label %else @@ -66,21 +104,42 @@ } define void @test_load_add(i1 %cond, float* %fptr, i1* %iptr1, i1* %iptr2, float %f1, float %f2) { -; CHECK-LABEL: test_load_add: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB2_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: kmovb (%rdx), %k0 -; CHECK-NEXT: kmovb (%rcx), %k1 -; CHECK-NEXT: kaddb %k1, %k0, %k1 -; CHECK-NEXT: jmp .LBB2_3 -; CHECK-NEXT: .LBB2_2: # %else -; CHECK-NEXT: kmovb (%rcx), %k1 -; CHECK-NEXT: .LBB2_3: # %exit -; CHECK-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovss %xmm1, (%rsi) -; CHECK-NEXT: retq +; X86-64-LABEL: test_load_add: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB2_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: kmovb (%rdx), %k0 +; X86-64-NEXT: kmovb (%rcx), %k1 +; X86-64-NEXT: kaddb %k1, %k0, %k1 +; X86-64-NEXT: jmp .LBB2_3 +; X86-64-NEXT: .LBB2_2: # %else +; X86-64-NEXT: kmovb (%rcx), %k1 +; X86-64-NEXT: .LBB2_3: # %exit +; X86-64-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1} +; X86-64-NEXT: vmovss %xmm1, (%rsi) +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_load_add: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB2_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-32-NEXT: kmovb (%edx), %k0 +; X86-32-NEXT: kmovb (%ecx), %k1 +; X86-32-NEXT: kaddb %k1, %k0, %k1 +; X86-32-NEXT: jmp .LBB2_3 +; X86-32-NEXT: .LBB2_2: # %else +; X86-32-NEXT: kmovb (%ecx), %k1 +; X86-32-NEXT: .LBB2_3: # %exit +; X86-32-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; X86-32-NEXT: vmovss %xmm0, (%eax) +; X86-32-NEXT: retl entry: br i1 %cond, label %if, label %else @@ -102,19 +161,37 @@ } define void @test_load_i1(i1 %cond, float* %fptr, i1* %iptr1, i1* %iptr2, float %f1, float %f2) { -; CHECK-LABEL: test_load_i1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB3_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: kmovb (%rdx), %k1 -; CHECK-NEXT: jmp .LBB3_3 -; CHECK-NEXT: .LBB3_2: # %else -; CHECK-NEXT: kmovb (%rcx), %k1 -; CHECK-NEXT: .LBB3_3: # %exit -; CHECK-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovss %xmm1, (%rsi) -; CHECK-NEXT: retq +; X86-64-LABEL: test_load_i1: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB3_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: kmovb (%rdx), %k1 +; X86-64-NEXT: jmp .LBB3_3 +; X86-64-NEXT: .LBB3_2: # %else +; X86-64-NEXT: kmovb (%rcx), %k1 +; X86-64-NEXT: .LBB3_3: # %exit +; X86-64-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1} +; X86-64-NEXT: vmovss %xmm1, (%rsi) +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_load_i1: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB3_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: jmp .LBB3_3 +; X86-32-NEXT: .LBB3_2: # %else +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: .LBB3_3: # %exit +; X86-32-NEXT: kmovb (%ecx), %k1 +; X86-32-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; X86-32-NEXT: vmovss %xmm0, (%eax) +; X86-32-NEXT: retl entry: br i1 %cond, label %if, label %else @@ -134,19 +211,35 @@ } define void @test_loadi1_storei1(i1 %cond, i1* %iptr1, i1* %iptr2, i1* %iptr3) { -; CHECK-LABEL: test_loadi1_storei1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB4_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: movb (%rsi), %al -; CHECK-NEXT: jmp .LBB4_3 -; CHECK-NEXT: .LBB4_2: # %else -; CHECK-NEXT: movb (%rdx), %al -; CHECK-NEXT: .LBB4_3: # %exit -; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movb %al, (%rcx) -; CHECK-NEXT: retq +; X86-64-LABEL: test_loadi1_storei1: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB4_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: movb (%rsi), %al +; X86-64-NEXT: jmp .LBB4_3 +; X86-64-NEXT: .LBB4_2: # %else +; X86-64-NEXT: movb (%rdx), %al +; X86-64-NEXT: .LBB4_3: # %exit +; X86-64-NEXT: andb $1, %al +; X86-64-NEXT: movb %al, (%rcx) +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_loadi1_storei1: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB4_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: jmp .LBB4_3 +; X86-32-NEXT: .LBB4_2: # %else +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: .LBB4_3: # %exit +; X86-32-NEXT: movb (%ecx), %cl +; X86-32-NEXT: andb $1, %cl +; X86-32-NEXT: movb %cl, (%eax) +; X86-32-NEXT: retl entry: br i1 %cond, label %if, label %else @@ -165,23 +258,44 @@ } define void @test_shl1(i1 %cond, i8* %ptr1, i8* %ptr2, <8 x float> %fvec1, <8 x float> %fvec2, <8 x float>* %fptrvec) { -; CHECK-LABEL: test_shl1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: %ymm1 %ymm1 %zmm1 -; CHECK-NEXT: # kill: %ymm0 %ymm0 %zmm0 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB5_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: kmovb (%rsi), %k0 -; CHECK-NEXT: kaddb %k0, %k0, %k1 -; CHECK-NEXT: jmp .LBB5_3 -; CHECK-NEXT: .LBB5_2: # %else -; CHECK-NEXT: kmovb (%rdx), %k1 -; CHECK-NEXT: .LBB5_3: # %exit -; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %ymm1, (%rcx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X86-64-LABEL: test_shl1: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-64-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB5_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: kmovb (%rsi), %k0 +; X86-64-NEXT: kaddb %k0, %k0, %k1 +; X86-64-NEXT: jmp .LBB5_3 +; X86-64-NEXT: .LBB5_2: # %else +; X86-64-NEXT: kmovb (%rdx), %k1 +; X86-64-NEXT: .LBB5_3: # %exit +; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-64-NEXT: vmovaps %ymm1, (%rcx) +; X86-64-NEXT: vzeroupper +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_shl1: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-32-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB5_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: kmovb (%ecx), %k0 +; X86-32-NEXT: kaddb %k0, %k0, %k1 +; X86-32-NEXT: jmp .LBB5_3 +; X86-32-NEXT: .LBB5_2: # %else +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: kmovb (%ecx), %k1 +; X86-32-NEXT: .LBB5_3: # %exit +; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-32-NEXT: vmovaps %ymm1, (%eax) +; X86-32-NEXT: vzeroupper +; X86-32-NEXT: retl entry: br i1 %cond, label %if, label %else @@ -203,24 +317,46 @@ } define void @test_shr1(i1 %cond, i8* %ptr1, i8* %ptr2, <8 x float> %fvec1, <8 x float> %fvec2, <8 x float>* %fptrvec) { -; CHECK-LABEL: test_shr1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: %ymm1 %ymm1 %zmm1 -; CHECK-NEXT: # kill: %ymm0 %ymm0 %zmm0 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB6_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: movb (%rsi), %al -; CHECK-NEXT: shrb %al -; CHECK-NEXT: jmp .LBB6_3 -; CHECK-NEXT: .LBB6_2: # %else -; CHECK-NEXT: movb (%rdx), %al -; CHECK-NEXT: .LBB6_3: # %exit -; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %ymm1, (%rcx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X86-64-LABEL: test_shr1: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-64-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB6_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: movb (%rsi), %al +; X86-64-NEXT: shrb %al +; X86-64-NEXT: jmp .LBB6_3 +; X86-64-NEXT: .LBB6_2: # %else +; X86-64-NEXT: movb (%rdx), %al +; X86-64-NEXT: .LBB6_3: # %exit +; X86-64-NEXT: kmovd %eax, %k1 +; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-64-NEXT: vmovaps %ymm1, (%rcx) +; X86-64-NEXT: vzeroupper +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_shr1: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-32-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB6_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: movb (%ecx), %cl +; X86-32-NEXT: shrb %cl +; X86-32-NEXT: jmp .LBB6_3 +; X86-32-NEXT: .LBB6_2: # %else +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: movb (%ecx), %cl +; X86-32-NEXT: .LBB6_3: # %exit +; X86-32-NEXT: kmovd %ecx, %k1 +; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-32-NEXT: vmovaps %ymm1, (%eax) +; X86-32-NEXT: vzeroupper +; X86-32-NEXT: retl entry: br i1 %cond, label %if, label %else @@ -242,23 +378,44 @@ } define void @test_shr2(i1 %cond, i8* %ptr1, i8* %ptr2, <8 x float> %fvec1, <8 x float> %fvec2, <8 x float>* %fptrvec) { -; CHECK-LABEL: test_shr2: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: %ymm1 %ymm1 %zmm1 -; CHECK-NEXT: # kill: %ymm0 %ymm0 %zmm0 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB7_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: kmovb (%rsi), %k0 -; CHECK-NEXT: kshiftrb $2, %k0, %k1 -; CHECK-NEXT: jmp .LBB7_3 -; CHECK-NEXT: .LBB7_2: # %else -; CHECK-NEXT: kmovb (%rdx), %k1 -; CHECK-NEXT: .LBB7_3: # %exit -; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %ymm1, (%rcx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X86-64-LABEL: test_shr2: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-64-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB7_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: kmovb (%rsi), %k0 +; X86-64-NEXT: kshiftrb $2, %k0, %k1 +; X86-64-NEXT: jmp .LBB7_3 +; X86-64-NEXT: .LBB7_2: # %else +; X86-64-NEXT: kmovb (%rdx), %k1 +; X86-64-NEXT: .LBB7_3: # %exit +; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-64-NEXT: vmovaps %ymm1, (%rcx) +; X86-64-NEXT: vzeroupper +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_shr2: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-32-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB7_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: kmovb (%ecx), %k0 +; X86-32-NEXT: kshiftrb $2, %k0, %k1 +; X86-32-NEXT: jmp .LBB7_3 +; X86-32-NEXT: .LBB7_2: # %else +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: kmovb (%ecx), %k1 +; X86-32-NEXT: .LBB7_3: # %exit +; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-32-NEXT: vmovaps %ymm1, (%eax) +; X86-32-NEXT: vzeroupper +; X86-32-NEXT: retl entry: br i1 %cond, label %if, label %else @@ -280,23 +437,44 @@ } define void @test_shl(i1 %cond, i8* %ptr1, i8* %ptr2, <8 x float> %fvec1, <8 x float> %fvec2, <8 x float>* %fptrvec) { -; CHECK-LABEL: test_shl: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: %ymm1 %ymm1 %zmm1 -; CHECK-NEXT: # kill: %ymm0 %ymm0 %zmm0 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB8_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: kmovb (%rsi), %k0 -; CHECK-NEXT: kshiftlb $6, %k0, %k1 -; CHECK-NEXT: jmp .LBB8_3 -; CHECK-NEXT: .LBB8_2: # %else -; CHECK-NEXT: kmovb (%rdx), %k1 -; CHECK-NEXT: .LBB8_3: # %exit -; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %ymm1, (%rcx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X86-64-LABEL: test_shl: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-64-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB8_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: kmovb (%rsi), %k0 +; X86-64-NEXT: kshiftlb $6, %k0, %k1 +; X86-64-NEXT: jmp .LBB8_3 +; X86-64-NEXT: .LBB8_2: # %else +; X86-64-NEXT: kmovb (%rdx), %k1 +; X86-64-NEXT: .LBB8_3: # %exit +; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-64-NEXT: vmovaps %ymm1, (%rcx) +; X86-64-NEXT: vzeroupper +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_shl: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-32-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB8_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: kmovb (%ecx), %k0 +; X86-32-NEXT: kshiftlb $6, %k0, %k1 +; X86-32-NEXT: jmp .LBB8_3 +; X86-32-NEXT: .LBB8_2: # %else +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: kmovb (%ecx), %k1 +; X86-32-NEXT: .LBB8_3: # %exit +; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-32-NEXT: vmovaps %ymm1, (%eax) +; X86-32-NEXT: vzeroupper +; X86-32-NEXT: retl entry: br i1 %cond, label %if, label %else @@ -318,24 +496,46 @@ } define void @test_add(i1 %cond, i8* %ptr1, i8* %ptr2, <8 x float> %fvec1, <8 x float> %fvec2, <8 x float>* %fptrvec) { -; CHECK-LABEL: test_add: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: %ymm1 %ymm1 %zmm1 -; CHECK-NEXT: # kill: %ymm0 %ymm0 %zmm0 -; CHECK-NEXT: kmovb (%rsi), %k0 -; CHECK-NEXT: kmovb (%rdx), %k1 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB9_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: kandb %k1, %k0, %k1 -; CHECK-NEXT: jmp .LBB9_3 -; CHECK-NEXT: .LBB9_2: # %else -; CHECK-NEXT: kaddb %k1, %k0, %k1 -; CHECK-NEXT: .LBB9_3: # %exit -; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %ymm1, (%rcx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X86-64-LABEL: test_add: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-64-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-64-NEXT: kmovb (%rsi), %k0 +; X86-64-NEXT: kmovb (%rdx), %k1 +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB9_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: kandb %k1, %k0, %k1 +; X86-64-NEXT: jmp .LBB9_3 +; X86-64-NEXT: .LBB9_2: # %else +; X86-64-NEXT: kaddb %k1, %k0, %k1 +; X86-64-NEXT: .LBB9_3: # %exit +; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-64-NEXT: vmovaps %ymm1, (%rcx) +; X86-64-NEXT: vzeroupper +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_add: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-32-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-32-NEXT: kmovb (%edx), %k0 +; X86-32-NEXT: kmovb (%ecx), %k1 +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB9_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: kandb %k1, %k0, %k1 +; X86-32-NEXT: jmp .LBB9_3 +; X86-32-NEXT: .LBB9_2: # %else +; X86-32-NEXT: kaddb %k1, %k0, %k1 +; X86-32-NEXT: .LBB9_3: # %exit +; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-32-NEXT: vmovaps %ymm1, (%eax) +; X86-32-NEXT: vzeroupper +; X86-32-NEXT: retl entry: %loaded1 = load i8, i8* %ptr1 %loaded2 = load i8, i8* %ptr2