Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -19049,11 +19049,16 @@
                                     SDValue PreservedSrc,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
-  if (isAllOnesConstant(Mask))
-    return Op;
-
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
+
+  if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask)) {
+    if (MaskConst->getZExtValue() & 0x1)
+      return Op;
+    return PreservedSrc.isUndef() ? getZeroVector(VT, Subtarget, DAG, dl)
+                                  : PreservedSrc;
+  }
+
   // The mask should be of type MVT::i1
   SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
 
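The new branch consults only bit 0 of a constant mask, because these intrinsics operate on a single scalar (ss/sd) lane; the remaining seven bits are don't-cares. When bit 0 is clear, the operation folds away entirely: the result is the passthrough operand, or a zero vector when the passthrough is undef (the maskz intrinsic variants). Below is a minimal standalone C++ model of the fold, for illustration only; foldScalarMask and the std::array types are hypothetical stand-ins, not LLVM's SelectionDAG API:

#include <array>
#include <cstdint>
#include <cstdio>

using V4F = std::array<float, 4>;

// Models the constant-mask branch added above: only bit 0 of the i8 mask
// is consulted. A null PreservedSrc stands in for an undef passthrough
// (the maskz intrinsics), which folds to a zero vector.
static V4F foldScalarMask(const V4F &Op, const V4F *PreservedSrc,
                          uint8_t Mask) {
  if (Mask & 0x1)
    return Op;                          // Bit 0 set: keep the computed result.
  if (!PreservedSrc)
    return V4F{0.0f, 0.0f, 0.0f, 0.0f}; // Undef passthrough: zero vector.
  return *PreservedSrc;                 // Otherwise the merge source wins.
}

int main() {
  V4F op = {1.0f, 2.0f, 3.0f, 4.0f};
  // Mask = 2 has bit 0 clear, so the maskz form folds to all zeros,
  // matching the lone vxorps expected in test_const2_maskz below.
  V4F r = foldScalarMask(op, /*PreservedSrc=*/nullptr, /*Mask=*/2);
  std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);
  return 0;
}

This is why the tests below can drop the kxorw/kshiftrw mask setup whenever the mask operand is a compile-time constant.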
Index: test/CodeGen/X86/avx512-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics.ll
+++ test/CodeGen/X86/avx512-intrinsics.ll
@@ -5201,8 +5201,7 @@
 define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: kxorw %k0, %k0, %k1
-; CHECK-NEXT: vfmadd213ss (%rdi), %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT: retq
   %q = load float, float* %ptr_b
   %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
Index: test/CodeGen/X86/avx512-scalar_mask.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/avx512-scalar_mask.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_var_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, i8 %mask) {
+; CHECK-LABEL: test_var_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 %mask, i32 4)
+  ret < 4 x float> %res
+}
+
+define <4 x float>@test_var_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, i8 %mask) {
+; CHECK-LABEL: test_var_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+  %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 %mask, i32 4)
+  ret < 4 x float> %res
+}
+
+define <4 x float>@test_const0_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const0_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: retq
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 0, i32 4)
+  ret < 4 x float> %res
+}
+
+define <4 x float>@test_const0_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const0_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
+  %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 0, i32 4)
+  ret < 4 x float> %res
+}
+
+define <4 x float>@test_const2_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const2_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: retq
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 2, i32 4)
+  ret < 4 x float> %res
+}
+
+define <4 x float>@test_const2_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const2_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
+  %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 2, i32 4)
+  ret < 4 x float> %res
+}
+
+define <4 x float>@test_const_allone_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const_allone_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 -1, i32 4)
+  ret < 4 x float> %res
+}
+
+define <4 x float>@test_const_allone_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const_allone_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 -1, i32 4)
+  ret < 4 x float> %res
+}
+
+define <4 x float>@test_const_3_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const_3_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 3, i32 4)
+  ret < 4 x float> %res
+}
+
+define <4 x float>@test_const_3_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const_3_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 3, i32 4)
+  ret < 4 x float> %res
+}
Index: test/CodeGen/X86/avx512er-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512er-intrinsics.ll
+++ test/CodeGen/X86/avx512er-intrinsics.ll
@@ -121,9 +121,7 @@
 define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0) {
 ; CHECK-LABEL: test_rsqrt28_ss_maskz:
 ; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
-; CHECK-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
+; CHECK-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
 ; CHECK-NEXT: retq # encoding: [0xc3]
   %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 7, i32 8) ;
   ret <4 x float> %res
@@ -132,10 +130,7 @@
 define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0) {
 ; CHECK-LABEL: test_rsqrt28_ss_mask:
 ; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
-; CHECK-NEXT: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
-; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
+; CHECK-NEXT: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc1]
 ; CHECK-NEXT: retq # encoding: [0xc3]
   %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 7, i32 8) ;
   ret <4 x float> %res
@@ -144,9 +139,7 @@
 define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0) {
 ; CHECK-LABEL: test_rsqrt28_sd_maskz:
 ; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
-; CHECK-NEXT: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
+; CHECK-NEXT: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xcd,0xc0]
 ; CHECK-NEXT: retq # encoding: [0xc3]
   %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 7, i32 8) ;
   ret <2 x double> %res
@@ -155,10 +148,7 @@
 define <2 x double> @test_rsqrt28_sd_mask(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0) {
 ; CHECK-LABEL: test_rsqrt28_sd_mask:
 ; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
-; CHECK-NEXT: vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0xcd,0xd1]
-; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
+; CHECK-NEXT: vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xcd,0xc1]
 ; CHECK-NEXT: retq # encoding: [0xc3]
   %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 7, i32 8) ;
   ret <2 x double> %res
@@ -169,9 +159,7 @@
 define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr ) {
 ; CHECK-LABEL: test_rsqrt28_sd_maskz_mem:
 ; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
-; CHECK-NEXT: vrsqrt28sd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x07]
+; CHECK-NEXT: vrsqrt28sd (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0xcd,0x07]
 ; CHECK-NEXT: retq # encoding: [0xc3]
   %mem = load double , double * %ptr, align 8
   %mem_v = insertelement <2 x double> undef, double %mem, i32 0
@@ -182,9 +170,7 @@
 define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, double* %ptr ) {
 ; CHECK-LABEL: test_rsqrt28_sd_maskz_mem_offset:
 ; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
-; CHECK-NEXT: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x47,0x12]
+; CHECK-NEXT: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0xcd,0x47,0x12]
 ; CHECK-NEXT: retq # encoding: [0xc3]
   %ptr1 = getelementptr double, double* %ptr, i32 18
   %mem = load double , double * %ptr1, align 8
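Across the avx512er-intrinsics.ll deltas, the only change besides the dropped mask setup is the third EVEX payload byte of each encoding, which packs the zeroing bit (z), the SAE/broadcast bit (b), and the mask-register field (aaa). A throwaway decoder to sanity-check the new byte values; decodeEvexP2 is a hypothetical helper, with the field layout taken from the Intel SDM:

#include <cstdint>
#include <cstdio>

// Decode the fields of the third EVEX payload byte (P2) seen in the CHECK
// encodings above. Layout per the Intel SDM: bit 7 is z (zeroing-masking),
// bits 6:5 are L'L (rounding/SAE control when b is set), bit 4 is b
// (broadcast/RC/SAE), bit 3 is V', and bits 2:0 are aaa (mask register).
static void decodeEvexP2(uint8_t P2) {
  std::printf("0x%02x: z=%d L'L=%d b=%d aaa=%d\n", P2, (P2 >> 7) & 1,
              (P2 >> 5) & 3, (P2 >> 4) & 1, P2 & 7);
}

int main() {
  decodeEvexP2(0x99); // old register form: {%k1} {z} with {sae} -> z=1 b=1 aaa=1
  decodeEvexP2(0x18); // new register form: unmasked with {sae}  -> z=0 b=1 aaa=0
  decodeEvexP2(0x89); // old memory form:   {%k1} {z}            -> z=1 b=0 aaa=1
  decodeEvexP2(0x08); // new memory form:   unmasked             -> z=0 b=0 aaa=0
  return 0;
}

With bit 0 of the constant mask (i8 7) known to be set, the operation is emitted unmasked: the {%k1} {z} decoration disappears and the aaa and z fields drop to zero, which is exactly the byte-level difference FileCheck verifies here.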