Index: ../lib/Target/X86/X86InstrAVX512.td
===================================================================
--- ../lib/Target/X86/X86InstrAVX512.td
+++ ../lib/Target/X86/X86InstrAVX512.td
@@ -2774,18 +2774,22 @@
                   VR512:$src)>;
 
 let Predicates = [HasAVX512, NoVLX] in {
-def: Pat<(X86mstore addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)),
+def: Pat<(X86mstore addr:$ptr, VK8WM:$mask, (v8f32 VR256X:$src)),
          (VMOVUPSZmrk addr:$ptr,
          (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)),
-         (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>;
+         (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm))>;
 
 def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)),
          (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz
          (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>;
 
-def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src0))),
+def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (bc_v8f32 (v8i32 immAllZerosV)))),
+         (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz
+         (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>;
+
+def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (v8f32 VR256X:$src0))),
          (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmk
-         (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src0, sub_ymm),
+         (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src0, sub_ymm),
          (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>;
 }
@@ -2852,14 +2856,23 @@
 }
 // NoVLX patterns
 let Predicates = [HasAVX512, NoVLX] in {
-def: Pat<(X86mstore addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)),
+def: Pat<(X86mstore addr:$ptr, VK8WM:$mask, (v8i32 VR256X:$src)),
          (VMOVDQU32Zmrk addr:$ptr,
          (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)),
-         (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>;
+         (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm))>;
 
 def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)),
          (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz
          (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>;
+
+def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, (v8i32 immAllZerosV))),
+         (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz
+         (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>;
+
+def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, (v8i32 VR256X:$src0))),
+         (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmk
+         (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src0, sub_ymm),
+         (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>;
 }
 
 // Move Int Doubleword to Packed Double Int
Index: ../test/CodeGen/X86/masked_memop.ll
===================================================================
--- ../test/CodeGen/X86/masked_memop.ll
+++ ../test/CodeGen/X86/masked_memop.ll
@@ -139,18 +139,50 @@
   ret <4 x double> %res
 }
 
-; AVX2-LABEL: test11
+; AVX2-LABEL: test11a
 ; AVX2: vmaskmovps
 ; AVX2: vblendvps
-; SKX-LABEL: test11
-; SKX: vmovaps {{.*}}{%k1}
-define <8 x float> @test11(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
+; SKX-LABEL: test11a
+; SKX: vmovaps (%rdi), %ymm1 {%k1}
+; AVX512-LABEL: test11a
+; AVX512: vmovups (%rdi), %zmm1 {%k1}
+define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
   %mask = icmp eq <8 x i32> %trigger, zeroinitializer
   %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst)
   ret <8 x float> %res
 }
 
+; SKX-LABEL: test11b
+; SKX: vmovdqu32 (%rdi), %ymm1 {%k1}
+; AVX512-LABEL: test11b
+; AVX512: vmovdqu32 (%rdi), %zmm1 {%k1}
+define <8 x i32> @test11b(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %dst) {
+  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+  %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst)
+  ret <8 x i32> %res
+}
+
+; SKX-LABEL: test11c
+; SKX: vmovaps (%rdi), %ymm0 {%k1} {z}
+; AVX512-LABEL: test11c
+; AVX512: vmovups (%rdi), %zmm0 {%k1} {z}
+define <8 x float> @test11c(<8 x i32> %trigger, <8 x float>* %addr) {
+  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+  %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
+  ret <8 x float> %res
+}
+
+; SKX-LABEL: test11d
+; SKX: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
+; AVX512-LABEL: test11d
+; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
+define <8 x i32> @test11d(<8 x i32> %trigger, <8 x i32>* %addr) {
+  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+  %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
+  ret <8 x i32> %res
+}
+
 ; AVX2-LABEL: test12
 ; AVX2: vpmaskmovd %ymm
@@ -291,6 +323,7 @@
 declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
 declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
 declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
+declare <8 x i32> @llvm.masked.load.v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
 declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
 declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
 declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)