Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrSSE.td +++ llvm/trunk/lib/Target/X86/X86InstrSSE.td @@ -8703,116 +8703,47 @@ int_x86_avx2_maskstore_q, int_x86_avx2_maskstore_q_256>, VEX_W; -def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)), - (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>; - -def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)), - (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>; - -def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)), - (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>; - -def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)), - (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>; - -def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)), - (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>; - -def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), - (bc_v8f32 (v8i32 immAllZerosV)))), - (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>; - -def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))), - (VBLENDVPSYrr VR256:$src0, (VMASKMOVPSYrm VR256:$mask, addr:$ptr), - VR256:$mask)>; - -def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)), - (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>; - -def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 immAllZerosV))), - (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>; - -def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0))), - (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr), - VR256:$mask)>; - -def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)), - (VMASKMOVPSrm VR128:$mask, addr:$ptr)>; - -def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), - (bc_v4f32 (v4i32 immAllZerosV)))), - (VMASKMOVPSrm VR128:$mask, addr:$ptr)>; - -def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src0))), - (VBLENDVPSrr VR128:$src0, (VMASKMOVPSrm VR128:$mask, addr:$ptr), - VR128:$mask)>; - -def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)), - (VPMASKMOVDrm VR128:$mask, addr:$ptr)>; - -def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 immAllZerosV))), - (VPMASKMOVDrm VR128:$mask, addr:$ptr)>; - -def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0))), - (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr), - VR128:$mask)>; - -def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)), - (VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>; - -def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)), - (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>; - -def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)), - (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>; - -def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), - (v4f64 immAllZerosV))), - (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>; - -def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))), - (VBLENDVPDYrr VR256:$src0, (VMASKMOVPDYrm VR256:$mask, addr:$ptr), - VR256:$mask)>; - -def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)), - (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>; - -def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), - (bc_v4i64 (v8i32 immAllZerosV)))), - (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>; - -def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0))), - 
(VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
-                      VR256:$mask)>;
-def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)),
-         (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>;
-def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)),
-         (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>;
-def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
-         (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;
-def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
-                             (v2f64 immAllZerosV))),
-         (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;
-def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src0))),
-         (VBLENDVPDrr VR128:$src0, (VMASKMOVPDrm VR128:$mask, addr:$ptr),
-                      VR128:$mask)>;
-def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
-         (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;
-def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
-                             (bc_v2i64 (v4i32 immAllZerosV)))),
-         (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;
-def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src0))),
-         (VBLENDVPDrr VR128:$src0, (VPMASKMOVQrm VR128:$mask, addr:$ptr),
-                      VR128:$mask)>;
-
+multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
+                            ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
+  // masked store
+  def: Pat<(X86mstore addr:$ptr, (MaskVT RC:$mask), (VT RC:$src)),
+           (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
+  // masked load
+  def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
+           (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
+  def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
+                            (VT (bitconvert (ZeroVT immAllZerosV))))),
+           (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
+  def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
+           (!cast<Instruction>(BlendStr#"rr")
+               RC:$src0,
+               (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr),
+               RC:$mask)>;
+}
+let Predicates = [HasAVX] in {
+  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
+  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
+}
+let Predicates = [HasAVX1Only] in {
+  // zero vectors are created as v8f32 (based on X86TargetLowering::LowerBUILD_VECTOR)
+  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8f32>;
+  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8f32>;
+  // masked i32/i64 load/store is not supported on AVX1; use the ps/pd instructions instead
+  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
+  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8f32>;
+  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
+  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
+}
+let Predicates = [HasAVX2] in {
+  // zero vectors are created as v8i32 (based on X86TargetLowering::LowerBUILD_VECTOR)
+  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
+  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
+
+  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
+  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
+  defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
+  defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
+}
 //===----------------------------------------------------------------------===//
 // Variable Bit Shifts
 //
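For reference, the first HasAVX instantiation above, maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>, expands to essentially the same selection patterns as the hand-written v4f32 rules deleted by this patch. A sketch of that expansion (illustrative only, not part of the patch):

def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)),
         (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>;
def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
         (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;
def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask),
                             (v4f32 (bitconvert (v4i32 immAllZerosV))))),
         (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;
def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src0))),
         (VBLENDVPSrr VR128:$src0, (VMASKMOVPSrm VR128:$mask, addr:$ptr),
                      VR128:$mask)>;

The practical difference from the old list is that the integer (i32/i64) cases now also get AVX1 lowerings through the ps/pd maskmov instructions, as noted in the HasAVX1Only block.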
Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1438,7 +1438,7 @@
   int DataWidth = isa<PointerType>(ScalarTy) ?
     DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
 
-  return (DataWidth >= 32 && ST->hasAVX2());
+  return (DataWidth >= 32 && ST->hasAVX());
 }
 
 bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
Index: llvm/trunk/test/CodeGen/X86/masked_memop.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/masked_memop.ll
+++ llvm/trunk/test/CodeGen/X86/masked_memop.ll
@@ -1,20 +1,29 @@
-; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx < %s | FileCheck %s --check-prefix=AVX1
-; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx2 < %s | FileCheck %s --check-prefix=AVX2
-; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512
-; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s --check-prefix=SKX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx2 < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=SKX
 
-; FIXME: AVX1 supports vmaskmovp[s/d], so its codegen should be identical to AVX2 for FP cases.
-; For integer cases, AVX1 could use the FP instructions in place of vpmaskmov?
-
-; To test for the case where masked load/store is not legal, we should add a run with a target
+; To test for the case where masked load/store is not legal, we should add a run with a target
 ; that does not have AVX, but that case should probably be a separate test file using fewer tests
-; because it takes over 1.2 seconds to codegen these tests on Haswell 4GHz if there's no maskmov.
+; because it takes over 1.2 seconds to codegen these tests on Haswell 4GHz if there's no maskmov.
 
 define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
-; Bypassing exact checking here because it's over 300 lines.
; AVX1-LABEL: test1: -; AVX1-NOT: maskmov - +; AVX1: ## BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmaskmovps 32(%rdi), %ymm1, %ymm1 +; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0 +; AVX1-NEXT: retq +; ; AVX2-LABEL: test1: ; AVX2: ## BB#0: ; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 @@ -30,22 +39,26 @@ ; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 ; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} ; AVX512-NEXT: retq -; -; SKX-LABEL: test1: -; SKX: ## BB#0: -; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 -; SKX-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} -; SKX-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef) ret <16 x i32> %res } define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) { -; Bypassing exact checking here because it's over 300 lines. ; AVX1-LABEL: test2: -; AVX1-NOT: maskmov +; AVX1: ## BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmaskmovps 32(%rdi), %ymm1, %ymm1 +; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0 +; AVX1-NEXT: retq ; ; AVX2-LABEL: test2: ; AVX2: ## BB#0: @@ -62,22 +75,27 @@ ; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 ; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} ; AVX512-NEXT: retq -; -; SKX-LABEL: test2: -; SKX: ## BB#0: -; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 -; SKX-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} -; SKX-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>zeroinitializer) ret <16 x i32> %res } define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) { -; Bypassing exact checking here because it's over 300 lines. 
; AVX1-LABEL: test3: -; AVX1-NOT: maskmov +; AVX1: ## BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vmaskmovps %ymm3, %ymm1, 32(%rdi) +; AVX1-NEXT: vmaskmovps %ymm2, %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq ; ; AVX2-LABEL: test3: ; AVX2: ## BB#0: @@ -95,22 +113,28 @@ ; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 ; AVX512-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} ; AVX512-NEXT: retq -; -; SKX-LABEL: test3: -; SKX: ## BB#0: -; SKX-NEXT: vpxord %zmm2, %zmm2, %zmm2 -; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 -; SKX-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} -; SKX-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask) ret void } define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %dst) { -; Bypassing exact checking here because it's over 300 lines. ; AVX1-LABEL: test4: -; AVX1-NOT: maskmov +; AVX1: ## BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm4 +; AVX1-NEXT: vblendvps %ymm0, %ymm4, %ymm2, %ymm0 +; AVX1-NEXT: vmaskmovps 32(%rdi), %ymm1, %ymm2 +; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm3, %ymm1 +; AVX1-NEXT: retq ; ; AVX2-LABEL: test4: ; AVX2: ## BB#0: @@ -130,23 +154,31 @@ ; AVX512-NEXT: vmovups (%rdi), %zmm1 {%k1} ; AVX512-NEXT: vmovaps %zmm1, %zmm0 ; AVX512-NEXT: retq -; -; SKX-LABEL: test4: -; SKX: ## BB#0: -; SKX-NEXT: vpxord %zmm2, %zmm2, %zmm2 -; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 -; SKX-NEXT: vmovups (%rdi), %zmm1 {%k1} -; SKX-NEXT: vmovaps %zmm1, %zmm0 -; SKX-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer %res = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst) ret <16 x float> %res } define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double> %dst) { -; Bypassing exact checking here because it's over 100 lines. 
; AVX1-LABEL: test5: -; AVX1-NOT: maskmov +; AVX1: ## BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxdq %xmm3, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4 +; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0 +; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: retq ; ; AVX2-LABEL: test5: ; AVX2: ## BB#0: @@ -162,13 +194,13 @@ ; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test5: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 -; AVX512-NEXT: vmovupd (%rdi), %zmm1 {%k1} -; AVX512-NEXT: vmovaps %zmm1, %zmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test5: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k1} +; AVX512F-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-NEXT: retq ; ; SKX-LABEL: test5: ; SKX: ## BB#0: @@ -183,43 +215,21 @@ } define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) { -; AVX1-LABEL: test6: -; AVX1: ## BB#0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $0, %xmm3, %eax -; AVX1-NEXT: ## implicit-def: %XMM2 -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB5_2 -; AVX1-NEXT: ## BB#1: ## %cond.load -; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: LBB5_2: ## %else -; AVX1-NEXT: vpextrb $8, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB5_4 -; AVX1-NEXT: ## BB#3: ## %cond.load1 -; AVX1-NEXT: vmovhpd 8(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: LBB5_4: ## %else2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: test6: -; AVX2: ## BB#0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 -; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: test6: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 -; AVX512-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: test6: +; AVX: ## BB#0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 +; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: test6: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 +; AVX512F-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512F-NEXT: retq ; ; SKX-LABEL: test6: ; SKX: ## BB#0: @@ -234,57 +244,21 @@ } define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) { -; AVX1-LABEL: test7: -; AVX1: ## BB#0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: 
vpextrb $0, %xmm3, %eax -; AVX1-NEXT: ## implicit-def: %XMM2 -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB6_2 -; AVX1-NEXT: ## BB#1: ## %cond.load -; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX1-NEXT: LBB6_2: ## %else -; AVX1-NEXT: vpextrb $4, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB6_4 -; AVX1-NEXT: ## BB#3: ## %cond.load1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] -; AVX1-NEXT: LBB6_4: ## %else2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $8, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB6_6 -; AVX1-NEXT: ## BB#5: ## %cond.load4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] -; AVX1-NEXT: LBB6_6: ## %else5 -; AVX1-NEXT: vpextrb $12, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB6_8 -; AVX1-NEXT: ## BB#7: ## %cond.load7 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] -; AVX1-NEXT: LBB6_8: ## %else8 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: test7: -; AVX2: ## BB#0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: test7: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX512-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: test7: +; AVX: ## BB#0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 +; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: test7: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 +; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512F-NEXT: retq ; ; SKX-LABEL: test7: ; SKX: ## BB#0: @@ -302,36 +276,8 @@ ; AVX1-LABEL: test8: ; AVX1: ## BB#0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $0, %xmm3, %eax -; AVX1-NEXT: ## implicit-def: %XMM2 -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB7_2 -; AVX1-NEXT: ## BB#1: ## %cond.load -; AVX1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX1-NEXT: LBB7_2: ## %else -; AVX1-NEXT: vpextrb $4, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB7_4 -; AVX1-NEXT: ## BB#3: ## %cond.load1 -; AVX1-NEXT: vpinsrd $1, 4(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: LBB7_4: ## %else2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $8, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB7_6 -; AVX1-NEXT: ## BB#5: ## %cond.load4 -; AVX1-NEXT: vpinsrd $2, 8(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: LBB7_6: ## %else5 -; AVX1-NEXT: vpextrb $12, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB7_8 -; AVX1-NEXT: ## BB#7: ## %cond.load7 -; AVX1-NEXT: vpinsrd $3, 12(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: LBB7_8: ## %else8 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -343,13 +289,13 @@ ; 
AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test8: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 -; AVX512-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test8: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 +; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512F-NEXT: retq ; ; SKX-LABEL: test8: ; SKX: ## BB#0: @@ -367,33 +313,8 @@ ; AVX1-LABEL: test9: ; AVX1: ## BB#0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB8_2 -; AVX1-NEXT: ## BB#1: ## %cond.store -; AVX1-NEXT: vmovd %xmm1, (%rdi) -; AVX1-NEXT: LBB8_2: ## %else -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB8_4 -; AVX1-NEXT: ## BB#3: ## %cond.store1 -; AVX1-NEXT: vpextrd $1, %xmm1, 4(%rdi) -; AVX1-NEXT: LBB8_4: ## %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $8, %xmm0, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB8_6 -; AVX1-NEXT: ## BB#5: ## %cond.store3 -; AVX1-NEXT: vpextrd $2, %xmm1, 8(%rdi) -; AVX1-NEXT: LBB8_6: ## %else4 -; AVX1-NEXT: vpextrb $12, %xmm0, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB8_8 -; AVX1-NEXT: ## BB#7: ## %cond.store5 -; AVX1-NEXT: vpextrd $3, %xmm1, 12(%rdi) -; AVX1-NEXT: LBB8_8: ## %else6 +; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test9: @@ -403,12 +324,12 @@ ; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) ; AVX2-NEXT: retq ; -; AVX512-LABEL: test9: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) -; AVX512-NEXT: retq +; AVX512F-LABEL: test9: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) +; AVX512F-NEXT: retq ; ; SKX-LABEL: test9: ; SKX: ## BB#0: @@ -425,45 +346,12 @@ ; AVX1-LABEL: test10: ; AVX1: ## BB#0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $0, %xmm3, %eax -; AVX1-NEXT: ## implicit-def: %YMM2 -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB9_2 -; AVX1-NEXT: ## BB#1: ## %cond.load -; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: LBB9_2: ## %else -; AVX1-NEXT: vpextrb $4, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB9_4 -; AVX1-NEXT: ## BB#3: ## %cond.load1 -; AVX1-NEXT: vmovhpd 8(%rdi), %xmm2, %xmm3 -; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX1-NEXT: LBB9_4: ## %else2 -; AVX1-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $8, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB9_6 -; AVX1-NEXT: ## BB#5: ## %cond.load4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovlpd 16(%rdi), %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-NEXT: LBB9_6: ## %else5 -; AVX1-NEXT: vpextrb $12, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB9_8 -; AVX1-NEXT: ## BB#7: ## %cond.load7 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vmovhpd 24(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, 
%ymm2, %ymm2 -; AVX1-NEXT: LBB9_8: ## %else8 -; AVX1-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 ; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -476,14 +364,14 @@ ; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test10: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX512-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 -; AVX512-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test10: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX512F-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 +; AVX512F-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 +; AVX512F-NEXT: retq ; ; SKX-LABEL: test10: ; SKX: ## BB#0: @@ -497,10 +385,56 @@ ret <4 x double> %res } +define <4 x double> @test10b(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) { +; AVX1-LABEL: test10b: +; AVX1: ## BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test10b: +; AVX2: ## BB#0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test10b: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX512F-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; SKX-LABEL: test10b: +; SKX: ## BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; SKX-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z} +; SKX-NEXT: retq + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>zeroinitializer) + ret <4 x double> %res +} + define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) { -; Bypassing exact checking here because it's over 100 lines. 
; AVX1-LABEL: test11a: -; AVX1-NOT: maskmov +; AVX1: ## BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 +; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 +; AVX1-NEXT: retq ; ; AVX2-LABEL: test11a: ; AVX2: ## BB#0: @@ -510,15 +444,15 @@ ; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test11a: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 -; AVX512-NEXT: kshiftlw $8, %k0, %k0 -; AVX512-NEXT: kshiftrw $8, %k0, %k1 -; AVX512-NEXT: vmovups (%rdi), %zmm1 {%k1} -; AVX512-NEXT: vmovaps %zmm1, %zmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test11a: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $8, %k0, %k0 +; AVX512F-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-NEXT: vmovups (%rdi), %zmm1 {%k1} +; AVX512F-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-NEXT: retq ; ; SKX-LABEL: test11a: ; SKX: ## BB#0: @@ -533,9 +467,18 @@ } define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) { -; Bypassing exact checking here because it's over 70 lines. ; AVX1-LABEL: test11b: -; AVX1-NOT: maskmov +; AVX1: ## BB#0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 +; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 +; AVX1-NEXT: retq ; ; AVX2-LABEL: test11b: ; AVX2: ## BB#0: @@ -546,16 +489,16 @@ ; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test11b: -; AVX512: ## BB#0: -; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k0 -; AVX512-NEXT: kshiftlw $8, %k0, %k0 -; AVX512-NEXT: kshiftrw $8, %k0, %k1 -; AVX512-NEXT: vmovdqu32 (%rdi), %zmm1 {%k1} -; AVX512-NEXT: vmovaps %zmm1, %zmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test11b: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $8, %k0, %k0 +; AVX512F-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm1 {%k1} +; AVX512F-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-NEXT: retq ; ; SKX-LABEL: test11b: ; SKX: ## BB#0: @@ -569,9 +512,17 @@ } define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) { -; Bypassing exact checking here because it's over 70 lines. 
; AVX1-LABEL: test11c: -; AVX1-NOT: maskmov +; AVX1: ## BB#0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0 +; AVX1-NEXT: retq ; ; AVX2-LABEL: test11c: ; AVX2: ## BB#0: @@ -581,15 +532,15 @@ ; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test11c: -; AVX512: ## BB#0: -; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k0 -; AVX512-NEXT: kshiftlw $8, %k0, %k0 -; AVX512-NEXT: kshiftrw $8, %k0, %k1 -; AVX512-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} -; AVX512-NEXT: retq +; AVX512F-LABEL: test11c: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $8, %k0, %k0 +; AVX512F-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} +; AVX512F-NEXT: retq ; ; SKX-LABEL: test11c: ; SKX: ## BB#0: @@ -602,9 +553,17 @@ } define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) { -; Bypassing exact checking here because it's over 70 lines. ; AVX1-LABEL: test11d: -; AVX1-NOT: maskmov +; AVX1: ## BB#0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0 +; AVX1-NEXT: retq ; ; AVX2-LABEL: test11d: ; AVX2: ## BB#0: @@ -614,15 +573,15 @@ ; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test11d: -; AVX512: ## BB#0: -; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k0 -; AVX512-NEXT: kshiftlw $8, %k0, %k0 -; AVX512-NEXT: kshiftrw $8, %k0, %k1 -; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} -; AVX512-NEXT: retq +; AVX512F-LABEL: test11d: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $8, %k0, %k0 +; AVX512F-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} +; AVX512F-NEXT: retq ; ; SKX-LABEL: test11d: ; SKX: ## BB#0: @@ -635,9 +594,16 @@ } define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) { -; Bypassing exact checking here because it's over 90 lines. 
; AVX1-LABEL: test12: -; AVX1-NOT: maskmov +; AVX1: ## BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq ; ; AVX2-LABEL: test12: ; AVX2: ## BB#0: @@ -647,14 +613,14 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test12: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 -; AVX512-NEXT: kshiftlw $8, %k0, %k0 -; AVX512-NEXT: kshiftrw $8, %k0, %k1 -; AVX512-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} -; AVX512-NEXT: retq +; AVX512F-LABEL: test12: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $8, %k0, %k0 +; AVX512F-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} +; AVX512F-NEXT: retq ; ; SKX-LABEL: test12: ; SKX: ## BB#0: @@ -668,9 +634,21 @@ } define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val) { -; Bypassing exact checking here because it's over 300 lines. ; AVX1-LABEL: test13: -; AVX1-NOT: maskmov +; AVX1: ## BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vmaskmovps %ymm3, %ymm1, 32(%rdi) +; AVX1-NEXT: vmaskmovps %ymm2, %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq ; ; AVX2-LABEL: test13: ; AVX2: ## BB#0: @@ -688,13 +666,6 @@ ; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 ; AVX512-NEXT: vmovups %zmm1, (%rdi) {%k1} ; AVX512-NEXT: retq -; -; SKX-LABEL: test13: -; SKX: ## BB#0: -; SKX-NEXT: vpxord %zmm2, %zmm2, %zmm2 -; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 -; SKX-NEXT: vmovups %zmm1, (%rdi) {%k1} -; SKX-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask) ret void @@ -704,22 +675,11 @@ ; AVX1-LABEL: test14: ; AVX1: ## BB#0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpextrb $0, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB16_2 -; AVX1-NEXT: ## BB#1: ## %cond.store -; AVX1-NEXT: vmovss %xmm1, (%rdi) -; AVX1-NEXT: LBB16_2: ## %else ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $8, %xmm0, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB16_4 -; AVX1-NEXT: ## BB#3: ## %cond.store1 -; AVX1-NEXT: vextractps $1, %xmm1, 4(%rdi) -; AVX1-NEXT: LBB16_4: ## %else2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test14: @@ -732,15 +692,15 @@ ; AVX2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) ; AVX2-NEXT: retq ; -; AVX512-LABEL: test14: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512-NEXT: vpcmpeqq 
%xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) -; AVX512-NEXT: retq +; AVX512F-LABEL: test14: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512F-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) +; AVX512F-NEXT: retq ; ; SKX-LABEL: test14: ; SKX: ## BB#0: @@ -760,22 +720,12 @@ ; AVX1-LABEL: test15: ; AVX1: ## BB#0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpextrb $0, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB17_2 -; AVX1-NEXT: ## BB#1: ## %cond.store -; AVX1-NEXT: vmovd %xmm1, (%rdi) -; AVX1-NEXT: LBB17_2: ## %else ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $8, %xmm0, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB17_4 -; AVX1-NEXT: ## BB#3: ## %cond.store1 -; AVX1-NEXT: vpextrd $2, %xmm1, 4(%rdi) -; AVX1-NEXT: LBB17_4: ## %else2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test15: @@ -789,16 +739,16 @@ ; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) ; AVX2-NEXT: retq ; -; AVX512-LABEL: test15: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) -; AVX512-NEXT: retq +; AVX512F-LABEL: test15: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512F-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) +; AVX512F-NEXT: retq ; ; SKX-LABEL: test15: ; SKX: ## BB#0: @@ -815,29 +765,12 @@ define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) { ; AVX1-LABEL: test16: ; AVX1: ## BB#0: -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: ## implicit-def: %XMM2 -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB18_2 -; AVX1-NEXT: ## BB#1: ## %cond.load -; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX1-NEXT: LBB18_2: ## %else -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpextrb $8, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB18_4 -; AVX1-NEXT: ## BB#3: ## %cond.load1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] -; AVX1-NEXT: LBB18_4: ## %else2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, 
%xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -852,16 +785,16 @@ ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX512-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test16: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 +; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512F-NEXT: retq ; ; SKX-LABEL: test16: ; SKX: ## BB#0: @@ -881,29 +814,15 @@ define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) { ; AVX1-LABEL: test17: ; AVX1: ## BB#0: -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: ## implicit-def: %XMM2 -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB19_2 -; AVX1-NEXT: ## BB#1: ## %cond.load -; AVX1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX1-NEXT: LBB19_2: ## %else -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpextrb $8, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB19_4 -; AVX1-NEXT: ## BB#3: ## %cond.load1 -; AVX1-NEXT: movl 4(%rdi), %eax -; AVX1-NEXT: vpinsrq $1, %rax, %xmm2, %xmm2 -; AVX1-NEXT: LBB19_4: ## %else2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test17: @@ -919,18 +838,18 @@ ; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test17: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = 
xmm0[0],zero -; AVX512-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test17: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512F-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX512F-NEXT: retq ; ; SKX-LABEL: test17: ; SKX: ## BB#0: @@ -951,30 +870,12 @@ define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) { ; AVX1-LABEL: test18: ; AVX1: ## BB#0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $0, %xmm1, %eax -; AVX1-NEXT: ## implicit-def: %XMM1 -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB20_2 -; AVX1-NEXT: ## BB#1: ## %cond.load -; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: LBB20_2: ## %else -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpextrb $8, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB20_4 -; AVX1-NEXT: ## BB#3: ## %cond.load1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; AVX1-NEXT: LBB20_4: ## %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test18: @@ -987,15 +888,15 @@ ; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test18: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test18: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512F-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: retq ; ; SKX-LABEL: test18: ; SKX: ## BB#0: @@ -1012,22 +913,17 @@ } define <4 x float> @test19(<4 x i32> %trigger, <4 x float>* %addr) { -; AVX1-LABEL: test19: -; AVX1: ## BB#0: -; AVX1-NEXT: vmovups (%rdi), %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: test19: -; AVX2: ## BB#0: -; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; 
AVX512-LABEL: test19: -; AVX512: ## BB#0: -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: test19: +; AVX: ## BB#0: +; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: test19: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: retq ; ; SKX-LABEL: test19: ; SKX: ## BB#0: @@ -1040,24 +936,19 @@ } define <4 x float> @test20(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %src0) { -; AVX1-LABEL: test20: -; AVX1: ## BB#0: -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm1[1],mem[2,3] -; AVX1-NEXT: retq -; -; AVX2-LABEL: test20: -; AVX2: ## BB#0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,4294967295,4294967295] -; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: test20: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,4294967295,4294967295] -; AVX512-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX512-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: test20: +; AVX: ## BB#0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,4294967295,4294967295] +; AVX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 +; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: test20: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,4294967295,4294967295] +; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 +; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512F-NEXT: retq ; ; SKX-LABEL: test20: ; SKX: ## BB#0: @@ -1074,7 +965,8 @@ define void @test21(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) { ; AVX1-LABEL: test21: ; AVX1: ## BB#0: -; AVX1-NEXT: vmovups %xmm1, (%rdi) +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test21: @@ -1083,11 +975,11 @@ ; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) ; AVX2-NEXT: retq ; -; AVX512-LABEL: test21: -; AVX512: ## BB#0: -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) -; AVX512-NEXT: retq +; AVX512F-LABEL: test21: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) +; AVX512F-NEXT: retq ; ; SKX-LABEL: test21: ; SKX: ## BB#0: @@ -1102,7 +994,9 @@ define void @test22(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) { ; AVX1-LABEL: test22: ; AVX1: ## BB#0: -; AVX1-NEXT: vmovd %xmm1, (%rdi) +; AVX1-NEXT: movl $-1, %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test22: @@ -1112,12 +1006,12 @@ ; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) ; AVX2-NEXT: retq ; -; AVX512-LABEL: test22: -; AVX512: ## BB#0: -; AVX512-NEXT: movl $-1, %eax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) -; AVX512-NEXT: retq +; AVX512F-LABEL: test22: +; AVX512F: ## BB#0: +; AVX512F-NEXT: movl $-1, %eax +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) +; AVX512F-NEXT: retq ; ; SKX-LABEL: test22: ; SKX: ## BB#0: @@ -1155,9 +1049,30 @@ declare <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>*, i32, <16 x i1>, <16 x i32*>) define <16 x i32*> @test23(<16 x i32*> %trigger, <16 x i32*>* %addr) { -; Bypassing exact checking here because it's over 700 lines. 
; AVX1-LABEL: test23: -; AVX1-NOT: maskmov +; AVX1: ## BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqq %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqq %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqq %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vmaskmovpd 96(%rdi), %ymm3, %ymm3 +; AVX1-NEXT: vmaskmovpd 64(%rdi), %ymm2, %ymm2 +; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm1 +; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 +; AVX1-NEXT: retq ; ; AVX2-LABEL: test23: ; AVX2: ## BB#0: @@ -1180,15 +1095,6 @@ ; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k2} {z} ; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} ; AVX512-NEXT: retq -; -; SKX-LABEL: test23: -; SKX: ## BB#0: -; SKX-NEXT: vpxord %zmm2, %zmm2, %zmm2 -; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k2 -; SKX-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k2} {z} -; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} -; SKX-NEXT: retq %mask = icmp eq <16 x i32*> %trigger, zeroinitializer %res = call <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>* %addr, i32 4, <16 x i1>%mask, <16 x i32*>zeroinitializer) ret <16 x i32*> %res @@ -1199,9 +1105,45 @@ declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>) define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) { -; Bypassing exact checking here because it's over 100 lines. 
; AVX1-LABEL: test24: -; AVX1-NOT: maskmov +; AVX1: ## BB#0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmaskmovpd 96(%rdi), %ymm1, %ymm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmaskmovpd 64(%rdi), %ymm1, %ymm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm0, %ymm1 +; AVX1-NEXT: vmovapd %ymm4, %ymm0 +; AVX1-NEXT: retq ; ; AVX2-LABEL: test24: ; AVX2: ## BB#0: @@ -1231,15 +1173,15 @@ ; AVX2-NEXT: vmovdqa %ymm4, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test24: -; AVX512: ## BB#0: -; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} -; AVX512-NEXT: kshiftrw $8, %k1, %k1 -; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z} -; AVX512-NEXT: retq +; AVX512F-LABEL: test24: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} +; AVX512F-NEXT: kshiftrw $8, %k1, %k1 +; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z} +; AVX512F-NEXT: retq ; ; SKX-LABEL: test24: ; SKX: ## BB#0: @@ -1254,9 +1196,45 @@ } define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) { -; Bypassing exact checking here because it's over 100 lines. 
; AVX1-LABEL: test_store_16i64: -; AVX1-NOT: maskmov +; AVX1: ## BB#0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpslld $31, %xmm5, %xmm5 +; AVX1-NEXT: vpsrad $31, %xmm5, %xmm5 +; AVX1-NEXT: vpmovsxdq %xmm5, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm5, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-NEXT: vmaskmovpd %ymm1, %ymm5, (%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-NEXT: vmaskmovpd %ymm4, %ymm1, 96(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-NEXT: vmaskmovpd %ymm3, %ymm1, 64(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmaskmovpd %ymm2, %ymm0, 32(%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq ; ; AVX2-LABEL: test_store_16i64: ; AVX2: ## BB#0: @@ -1286,15 +1264,15 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_store_16i64: -; AVX512: ## BB#0: -; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1} -; AVX512-NEXT: kshiftrw $8, %k1, %k1 -; AVX512-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1} -; AVX512-NEXT: retq +; AVX512F-LABEL: test_store_16i64: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1} +; AVX512F-NEXT: kshiftrw $8, %k1, %k1 +; AVX512F-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1} +; AVX512F-NEXT: retq ; ; SKX-LABEL: test_store_16i64: ; SKX: ## BB#0: @@ -1310,9 +1288,45 @@ declare void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask) define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) { -; Bypassing exact checking here because it's over 100 lines. 
; AVX1-LABEL: test_store_16f64:
-; AVX1-NOT: maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrad $31, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT:    vmaskmovpd %ymm1, %ymm5, (%rdi)
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; AVX1-NEXT:    vmaskmovpd %ymm4, %ymm1, 96(%rdi)
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT:    vmaskmovpd %ymm3, %ymm1, 64(%rdi)
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmaskmovpd %ymm2, %ymm0, 32(%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_store_16f64:
; AVX2:       ## BB#0:
@@ -1342,15 +1356,15 @@
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
-; AVX512-LABEL: test_store_16f64:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT:    vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; AVX512-NEXT:    vmovupd %zmm1, (%rdi) {%k1}
-; AVX512-NEXT:    kshiftrw $8, %k1, %k1
-; AVX512-NEXT:    vmovupd %zmm2, 64(%rdi) {%k1}
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test_store_16f64:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vmovupd %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512F-NEXT:    vmovupd %zmm2, 64(%rdi) {%k1}
+; AVX512F-NEXT:    retq
;
; SKX-LABEL: test_store_16f64:
; SKX:       ## BB#0:
@@ -1366,9 +1380,49 @@
declare void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask)

define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
-; Bypassing exact checking here because it's over 100 lines.
; AVX1-LABEL: test_load_16i64:
-; AVX1-NOT: maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrad $31, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm5, %ymm6
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm6, %ymm1, %ymm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm6, %ymm1
+; AVX1-NEXT:    vmaskmovpd 32(%rdi), %ymm1, %ymm6
+; AVX1-NEXT:    vblendvpd %ymm1, %ymm6, %ymm2, %ymm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm2
+; AVX1-NEXT:    vmaskmovpd 64(%rdi), %ymm2, %ymm6
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm6, %ymm3, %ymm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vmaskmovpd 96(%rdi), %ymm0, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm0, %ymm3, %ymm4, %ymm3
+; AVX1-NEXT:    vmovapd %ymm5, %ymm0
+; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_load_16i64:
; AVX2:       ## BB#0:
@@ -1402,17 +1456,17 @@
; AVX2-NEXT:    vmovapd %ymm5, %ymm0
; AVX2-NEXT:    retq
;
-; AVX512-LABEL: test_load_16i64:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT:    vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm1 {%k1}
-; AVX512-NEXT:    kshiftrw $8, %k1, %k1
-; AVX512-NEXT:    vmovdqu64 64(%rdi), %zmm2 {%k1}
-; AVX512-NEXT:    vmovaps %zmm1, %zmm0
-; AVX512-NEXT:    vmovaps %zmm2, %zmm1
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test_load_16i64:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm2 {%k1}
+; AVX512F-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512F-NEXT:    vmovaps %zmm2, %zmm1
+; AVX512F-NEXT:    retq
;
; SKX-LABEL: test_load_16i64:
; SKX:       ## BB#0:
@@ -1430,9 +1484,49 @@
declare <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)

define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
-; Bypassing exact checking here because it's over 100 lines.
; AVX1-LABEL: test_load_16f64:
-; AVX1-NOT: maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrad $31, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm5, %ymm6
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm6, %ymm1, %ymm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm6, %ymm1
+; AVX1-NEXT:    vmaskmovpd 32(%rdi), %ymm1, %ymm6
+; AVX1-NEXT:    vblendvpd %ymm1, %ymm6, %ymm2, %ymm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm2
+; AVX1-NEXT:    vmaskmovpd 64(%rdi), %ymm2, %ymm6
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm6, %ymm3, %ymm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vmaskmovpd 96(%rdi), %ymm0, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm0, %ymm3, %ymm4, %ymm3
+; AVX1-NEXT:    vmovapd %ymm5, %ymm0
+; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_load_16f64:
; AVX2:       ## BB#0:
@@ -1466,17 +1560,17 @@
; AVX2-NEXT:    vmovapd %ymm5, %ymm0
; AVX2-NEXT:    retq
;
-; AVX512-LABEL: test_load_16f64:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT:    vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; AVX512-NEXT:    vmovupd (%rdi), %zmm1 {%k1}
-; AVX512-NEXT:    kshiftrw $8, %k1, %k1
-; AVX512-NEXT:    vmovupd 64(%rdi), %zmm2 {%k1}
-; AVX512-NEXT:    vmovaps %zmm1, %zmm0
-; AVX512-NEXT:    vmovaps %zmm2, %zmm1
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test_load_16f64:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vmovupd (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512F-NEXT:    vmovupd 64(%rdi), %zmm2 {%k1}
+; AVX512F-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512F-NEXT:    vmovaps %zmm2, %zmm1
+; AVX512F-NEXT:    retq
;
; SKX-LABEL: test_load_16f64:
; SKX:       ## BB#0:
@@ -1494,9 +1588,111 @@
declare <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)

define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) {
-; Bypassing exact checking here because it's over 300 lines.
; AVX1-LABEL: test_load_32f64:
-; AVX1-NOT: maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    Ltmp0:
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    Ltmp1:
+; AVX1-NEXT:    .cfi_offset %rbp, -16
+; AVX1-NEXT:    movq %rsp, %rbp
+; AVX1-NEXT:    Ltmp2:
+; AVX1-NEXT:    .cfi_def_cfa_register %rbp
+; AVX1-NEXT:    andq $-32, %rsp
+; AVX1-NEXT:    subq $32, %rsp
+; AVX1-NEXT:    vmovapd 16(%rbp), %ymm8
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm9, %xmm9
+; AVX1-NEXT:    vpsrad $31, %xmm9, %xmm9
+; AVX1-NEXT:    vpmovsxdq %xmm9, %xmm10
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm9, %xmm9
+; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm10, %ymm9
+; AVX1-NEXT:    vmaskmovpd 32(%rsi), %ymm9, %ymm10
+; AVX1-NEXT:    vblendvpd %ymm9, %ymm10, %ymm2, %ymm9
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm10
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm10, %ymm2
+; AVX1-NEXT:    vmaskmovpd 64(%rsi), %ymm2, %ymm10
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm10, %ymm3, %ymm11
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm10
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm10, %ymm2
+; AVX1-NEXT:    vmaskmovpd 96(%rsi), %ymm2, %ymm10
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm10, %ymm4, %ymm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm10
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm10, %ymm3
+; AVX1-NEXT:    vmaskmovpd 160(%rsi), %ymm3, %ymm10
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm10, %ymm6, %ymm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm10
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm10, %ymm3
+; AVX1-NEXT:    vmaskmovpd 192(%rsi), %ymm3, %ymm10
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm10, %ymm7, %ymm7
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm10
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm10, %ymm3
+; AVX1-NEXT:    vmaskmovpd 224(%rsi), %ymm3, %ymm10
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm10, %ymm8, %ymm3
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm8
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm8, %ymm0
+; AVX1-NEXT:    vmaskmovpd (%rsi), %ymm0, %ymm8
+; AVX1-NEXT:    vblendvpd %ymm0, %ymm8, %ymm1, %ymm0
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vmaskmovpd 128(%rsi), %ymm1, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm1, %ymm2, %ymm5, %ymm1
+; AVX1-NEXT:    vmovapd %ymm1, 128(%rdi)
+; AVX1-NEXT:    vmovapd %ymm0, (%rdi)
+; AVX1-NEXT:    vmovapd %ymm3, 224(%rdi)
+; AVX1-NEXT:    vmovapd %ymm7, 192(%rdi)
+; AVX1-NEXT:    vmovapd %ymm6, 160(%rdi)
+; AVX1-NEXT:    vmovapd %ymm4, 96(%rdi)
+; AVX1-NEXT:    vmovapd %ymm11, 64(%rdi)
+; AVX1-NEXT:    vmovapd %ymm9, 32(%rdi)
+; AVX1-NEXT:    movq %rdi, %rax
+; AVX1-NEXT:    movq %rbp, %rsp
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_load_32f64:
; AVX2:       ## BB#0:
@@ -1580,26 +1776,26 @@
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
-; AVX512-LABEL: test_load_32f64:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX512-NEXT:    vpmovsxbd %xmm5, %zmm5
-; AVX512-NEXT:    vpslld $31, %zmm5, %zmm5
-; AVX512-NEXT:    vptestmd %zmm5, %zmm5, %k1
-; AVX512-NEXT:    vmovupd 128(%rdi), %zmm3 {%k1}
-; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT:    vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT:    vptestmd %zmm0, %zmm0, %k2
-; AVX512-NEXT:    vmovupd (%rdi), %zmm1 {%k2}
-; AVX512-NEXT:    kshiftrw $8, %k1, %k1
-; AVX512-NEXT:    vmovupd 192(%rdi), %zmm4 {%k1}
-; AVX512-NEXT:    kshiftrw $8, %k2, %k1
-; AVX512-NEXT:    vmovupd 64(%rdi), %zmm2 {%k1}
-; AVX512-NEXT:    vmovaps %zmm1, %zmm0
-; AVX512-NEXT:    vmovaps %zmm2, %zmm1
-; AVX512-NEXT:    vmovaps %zmm3, %zmm2
-; AVX512-NEXT:    vmovaps %zmm4, %zmm3
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test_load_32f64:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX512F-NEXT:    vpmovsxbd %xmm5, %zmm5
+; AVX512F-NEXT:    vpslld $31, %zmm5, %zmm5
+; AVX512F-NEXT:    vptestmd %zmm5, %zmm5, %k1
+; AVX512F-NEXT:    vmovupd 128(%rdi), %zmm3 {%k1}
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
+; AVX512F-NEXT:    vmovupd (%rdi), %zmm1 {%k2}
+; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512F-NEXT:    vmovupd 192(%rdi), %zmm4 {%k1}
+; AVX512F-NEXT:    kshiftrw $8, %k2, %k1
+; AVX512F-NEXT:    vmovupd 64(%rdi), %zmm2 {%k1}
+; AVX512F-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512F-NEXT:    vmovaps %zmm2, %zmm1
+; AVX512F-NEXT:    vmovaps %zmm3, %zmm2
+; AVX512F-NEXT:    vmovaps %zmm4, %zmm3
+; AVX512F-NEXT:    retq
;
; SKX-LABEL: test_load_32f64:
; SKX:       ## BB#0:
Index: llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -1,9 +1,7 @@
-; RUN: opt < %s -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX1
-; RUN: opt < %s -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX2
+; RUN: opt < %s -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX1
+; RUN: opt < %s -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX2
; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
-;AVX1-NOT: llvm.masked
-
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc_linux"
@@ -18,12 +16,12 @@
; }
;}
-;AVX2-LABEL: @foo1
-;AVX2: icmp slt <8 x i32> %wide.load, @llvm.masked.load.v8i32
-;AVX2: add nsw <8 x i32>
-;AVX2: call void @llvm.masked.store.v8i32
-;AVX2: ret void
+;AVX-LABEL: @foo1
+;AVX: icmp slt <8 x i32> %wide.load, @llvm.masked.load.v8i32
+;AVX: add nsw <8 x i32>
+;AVX: call void @llvm.masked.store.v8i32
+;AVX: ret void
;AVX512-LABEL: @foo1
;AVX512: icmp slt <16 x i32> %wide.load, %wide.load, @llvm.masked.load.v8f32
-;AVX2: fadd <8 x float>
-;AVX2: call void @llvm.masked.store.v8f32
-;AVX2: ret void
+;AVX-LABEL: @foo2
+;AVX: icmp slt <8 x i32> %wide.load, @llvm.masked.load.v8f32
+;AVX: fadd <8 x float>
+;AVX: call void @llvm.masked.store.v8f32
+;AVX: ret void
;AVX512-LABEL: @foo2
;AVX512: icmp slt <16 x i32> %wide.load, %wide.load, @llvm.masked.load.v4f64
-;AVX2: sitofp <4 x i32> %wide.load to <4 x double>
-;AVX2: fadd <4 x double>
-;AVX2: call void @llvm.masked.store.v4f64
-;AVX2: ret void
+;AVX-LABEL: @foo3
+;AVX: icmp slt <4 x i32> %wide.load, @llvm.masked.load.v4f64
+;AVX: sitofp <4 x i32> %wide.load to <4 x double>
+;AVX: fadd <4 x double>
+;AVX: call void @llvm.masked.store.v4f64
+;AVX: ret void
;AVX512-LABEL: @foo3
;AVX512: icmp slt <8 x i32> %wide.load,