Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -2737,7 +2737,7 @@
 [(set _.RC:$dst, (_.VT (bitconvert (ld_frag addr:$src))))], _.ExeDomain>, EVEX;
- let Constraints = "$src0 = $dst" in {
+ let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in {
   def rrk : AVX512PI
 @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
 ret <32 x double> %res
@@ -27,17 +24,14 @@
 ; AVX512BW: ## BB#0:
 ; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
-; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
+; AVX512BW-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
-; AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm3 {%k2}
+; AVX512BW-NEXT: vpblendmq 128(%rdi), %zmm3, %zmm5 {%k2}
 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
-; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
+; AVX512BW-NEXT: vpblendmq 64(%rdi), %zmm2, %zmm1 {%k1}
 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k1
-; AVX512BW-NEXT: vmovdqu64 192(%rdi), %zmm4 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3
+; AVX512BW-NEXT: vpblendmq 192(%rdi), %zmm4, %zmm3 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2
 ; AVX512BW-NEXT: retq
 %res = call <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>* %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0)
 ret <32 x i64> %res
Index: test/CodeGen/X86/avx512-masked-memop-64-32.ll
===================================================================
--- test/CodeGen/X86/avx512-masked-memop-64-32.ll
+++ test/CodeGen/X86/avx512-masked-memop-64-32.ll
@@ -43,8 +43,7 @@
 ; AVX512: ## BB#0:
 ; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
 ; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
-; AVX512-NEXT: vmovups (%rdi), %zmm1 {%k1}
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
 ; AVX512-NEXT: retq
 %mask = icmp eq <16 x i32> %trigger, zeroinitializer
 %res = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst)
@@ -189,22 +188,18 @@
 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT: kshiftrw $8, %k1, %k1
-; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512F-NEXT: vpblendmq 64(%rdi), %zmm2, %zmm1 {%k1}
 ; AVX512F-NEXT: retq
 ;
 ; SKX-LABEL: test_load_16i64:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
 ; SKX-NEXT: vpmovb2m %xmm0, %k1
-; SKX-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
+; SKX-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
 ; SKX-NEXT: kshiftrw $8, %k1, %k1
-; SKX-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
-; SKX-NEXT: vmovdqa64 %zmm2, %zmm1
+; SKX-NEXT: vpblendmq 64(%rdi), %zmm2, %zmm1 {%k1}
 ; SKX-NEXT: retq
 %res = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
 ret <16 x i64> %res
@@ -217,22 +212,18 @@
 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT: kshiftrw $8, %k1, %k1
-; AVX512F-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
-; AVX512F-NEXT: vmovapd %zmm1, %zmm0
-; AVX512F-NEXT: vmovapd %zmm2, %zmm1
+; AVX512F-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
 ; AVX512F-NEXT: retq
 ;
 ; SKX-LABEL: test_load_16f64:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
 ; SKX-NEXT: vpmovb2m %xmm0, %k1
-; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; SKX-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
 ; SKX-NEXT: kshiftrw $8, %k1, %k1
-; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
-; SKX-NEXT: vmovapd %zmm1, %zmm0
-; SKX-NEXT: vmovapd %zmm2, %zmm1
+; SKX-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
 ; SKX-NEXT: retq
 %res = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
 ret <16 x double> %res
@@ -246,36 +237,30 @@
 ; AVX512F-NEXT: vpmovsxbd %xmm5, %zmm5
 ; AVX512F-NEXT: vpslld $31, %zmm5, %zmm5
 ; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k1
-; AVX512F-NEXT: vmovupd 128(%rdi), %zmm3 {%k1}
+; AVX512F-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm5 {%k1}
 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
-; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k2}
+; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k2}
 ; AVX512F-NEXT: kshiftrw $8, %k1, %k1
-; AVX512F-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
+; AVX512F-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
 ; AVX512F-NEXT: kshiftrw $8, %k2, %k1
-; AVX512F-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
-; AVX512F-NEXT: vmovapd %zmm1, %zmm0
-; AVX512F-NEXT: vmovapd %zmm2, %zmm1
-; AVX512F-NEXT: vmovapd %zmm3, %zmm2
-; AVX512F-NEXT: vmovapd %zmm4, %zmm3
+; AVX512F-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT: vmovapd %zmm5, %zmm2
 ; AVX512F-NEXT: retq
 ;
 ; SKX-LABEL: test_load_32f64:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
 ; SKX-NEXT: vpmovb2m %ymm0, %k1
-; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; SKX-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
 ; SKX-NEXT: kshiftrd $16, %k1, %k2
-; SKX-NEXT: vmovupd 128(%rdi), %zmm3 {%k2}
+; SKX-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm5 {%k2}
 ; SKX-NEXT: kshiftrw $8, %k1, %k1
-; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
+; SKX-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
 ; SKX-NEXT: kshiftrw $8, %k2, %k1
-; SKX-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
-; SKX-NEXT: vmovapd %zmm1, %zmm0
-; SKX-NEXT: vmovapd %zmm2, %zmm1
-; SKX-NEXT: vmovapd %zmm3, %zmm2
-; SKX-NEXT: vmovapd %zmm4, %zmm3
+; SKX-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
+; SKX-NEXT: vmovapd %zmm5, %zmm2
 ; SKX-NEXT: retq
 %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
 ret <32 x double> %res
Index: test/CodeGen/X86/avx512-masked_memop-16-8.ll
===================================================================
--- test/CodeGen/X86/avx512-masked_memop-16-8.ll
+++ test/CodeGen/X86/avx512-masked_memop-16-8.ll
@@ -20,8 +20,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
 ; CHECK-NEXT: vpmovb2m %ymm0, %k1
-; CHECK-NEXT: vmovdqu8 (%rdi), %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmb (%rdi), %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %res = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* %addr, i32 4, <32 x i1>%mask, <32 x i8> %val)
 ret <32 x i8> %res
@@ -33,8 +32,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpsllw $7, %zmm0, %zmm0
 ; CHECK-NEXT: vpmovb2m %zmm0, %k1
-; CHECK-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmb (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %res = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* %addr, i32 4, <64 x i1>%mask, <64 x i8> %val)
 ret <64 x i8> %res
@@ -70,8 +68,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
 ; CHECK-NEXT: vpmovb2m %ymm0, %k1
-; CHECK-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmw (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %res = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* %addr, i32 4, <32 x i1>%mask, <32 x i16> %val)
 ret <32 x i16> %res
Index: test/CodeGen/X86/avx512-regcall-NoMask.ll
===================================================================
--- test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -325,13 +325,11 @@
 }
 ; X32-LABEL: test_argRet128Vector:
-; X32: vmovdqa{{.*}} %xmm0, %xmm1
-; X32: vmovdqa{{.*}} %xmm1, %xmm0
+; X32: vpblend{{.*}} %xmm0, %xmm1, %xmm0
 ; X32: ret{{.*}}
 ; WIN64-LABEL: test_argRet128Vector:
-; WIN64: vmovdqa{{.*}} %xmm0, %xmm1
-; WIN64: vmovdqa{{.*}} %xmm1, %xmm0
+; WIN64: vpblend{{.*}} %xmm0, %xmm1, %xmm0
 ; WIN64: ret{{.*}}
 ; Test regcall when receiving/returning 128 bit vector
@@ -360,13 +358,11 @@
 }
 ; X32-LABEL: test_argRet256Vector:
-; X32: vmovdqa{{.*}} %ymm0, %ymm1
-; X32: vmovdqa{{.*}} %ymm1, %ymm0
+; X32: vpblend{{.*}} %ymm0, %ymm1, %ymm0
 ; X32: ret{{.*}}
 ; WIN64-LABEL: test_argRet256Vector:
-; WIN64: vmovdqa{{.*}} %ymm0, %ymm1
-; WIN64: vmovdqa{{.*}} %ymm1, %ymm0
+; WIN64: vpblend{{.*}} %ymm0, %ymm1, %ymm0
 ; WIN64: ret{{.*}}
 ; Test regcall when receiving/returning 256 bit vector
@@ -395,13 +391,11 @@
 }
 ; X32-LABEL: test_argRet512Vector:
-; X32: vmovdqa{{.*}} %zmm0, %zmm1
-; X32: vmovdqa{{.*}} %zmm1, %zmm0
+; X32: vpblend{{.*}} %zmm0, %zmm1, %zmm0
 ; X32: ret{{.*}}
 ; WIN64-LABEL: test_argRet512Vector:
-; WIN64: vmovdqa{{.*}} %zmm0, %zmm1
-; WIN64: vmovdqa{{.*}} %zmm1, %zmm0
+; WIN64: vpblend{{.*}} %zmm0, %zmm1, %zmm0
 ; WIN64: ret{{.*}}
 ; Test regcall when receiving/returning 512 bit vector
Index: test/CodeGen/X86/avx512-vec-cmp.ll
===================================================================
--- test/CodeGen/X86/avx512-vec-cmp.ll
+++ test/CodeGen/X86/avx512-vec-cmp.ll
@@ -6,8 +6,7 @@
 ; CHECK-LABEL: test1:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vcmpleps %zmm1, %zmm0, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = fcmp ole <16 x float> %x, %y
 %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
@@ -18,8 +17,7 @@
 ; CHECK-LABEL: test2:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vcmplepd %zmm1, %zmm0, %k1
-; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = fcmp ole <8 x double> %x, %y
 %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
@@ -30,8 +28,7 @@
 ; CHECK-LABEL: test3:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <16 x i32>, <16 x i32>* %yp, align 4
 %mask = icmp eq <16 x i32> %x, %y
@@ -43,8 +40,7 @@
 ; CHECK-LABEL: test4_unsigned:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k1
-; CHECK-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp uge <16 x i32> %x, %y
 %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
@@ -55,8 +51,7 @@
 ; CHECK-LABEL: test5:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp eq <8 x i64> %x, %y
 %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
@@ -67,8 +62,7 @@
 ; CHECK-LABEL: test6_unsigned:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
-; CHECK-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmq %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp ugt <8 x i64> %x, %y
 %max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y
@@ -87,8 +81,7 @@
 ; SKX: ## BB#0:
 ; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; SKX-NEXT: vcmpltps %xmm2, %xmm0, %k1
-; SKX-NEXT: vmovaps %xmm0, %xmm1 {%k1}
-; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT: retq
 %mask = fcmp olt <4 x float> %a, zeroinitializer
@@ -108,8 +101,7 @@
 ; SKX: ## BB#0:
 ; SKX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; SKX-NEXT: vcmpltpd %xmm2, %xmm0, %k1
-; SKX-NEXT: vmovapd %xmm0, %xmm1 {%k1}
-; SKX-NEXT: vmovapd %xmm1, %xmm0
+; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT: retq
 %mask = fcmp olt <2 x double> %a, zeroinitializer
 %c = select <2 x i1>%mask, <2 x double>%a, <2 x double>%b
@@ -129,8 +121,7 @@
 ; SKX-LABEL: test9:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; SKX-NEXT: vmovdqa %ymm1, %ymm0
+; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT: retq
 %mask = icmp eq <8 x i32> %x, %y
 %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
@@ -150,8 +141,7 @@
 ; SKX-LABEL: test10:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vcmpeqps %ymm1, %ymm0, %k1
-; SKX-NEXT: vmovaps %ymm0, %ymm1 {%k1}
-; SKX-NEXT: vmovaps %ymm1, %ymm0
+; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT: retq
 %mask = fcmp oeq <8 x float> %x, %y
@@ -699,8 +689,7 @@
 ; CHECK-LABEL: test16:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k1
-; CHECK-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp sge <16 x i32> %x, %y
 %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
@@ -711,8 +700,7 @@
 ; CHECK-LABEL: test17:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
 %mask = icmp sgt <16 x i32> %x, %y
@@ -724,8 +712,7 @@
 ; CHECK-LABEL: test18:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpled (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
 %mask = icmp sle <16 x i32> %x, %y
@@ -737,8 +724,7 @@
 ; CHECK-LABEL: test19:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
 %mask = icmp ule <16 x i32> %x, %y
@@ -751,8 +737,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
 ; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp eq <16 x i32> %x1, %y1
 %mask0 = icmp eq <16 x i32> %x, %y
@@ -766,8 +751,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k1
 ; CHECK-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: vpblendmq %zmm0, %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sge <8 x i64> %x1, %y1
 %mask0 = icmp sle <8 x i64> %x, %y
@@ -781,8 +765,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtq %zmm2, %zmm1, %k1
 ; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sgt <8 x i64> %x1, %y1
 %y = load <8 x i64>, <8 x i64>* %y.ptr, align 4
@@ -797,8 +780,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1
 ; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sge <16 x i32> %x1, %y1
 %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
@@ -812,8 +794,7 @@
 ; CHECK-LABEL: test24:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %yb = load i64, i64* %yb.ptr, align 4
 %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
@@ -827,8 +808,7 @@
 ; CHECK-LABEL: test25:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1
-; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %yb = load i32, i32* %yb.ptr, align 4
 %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
@@ -843,8 +823,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1
 ; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sge <16 x i32> %x1, %y1
 %yb = load i32, i32* %yb.ptr, align 4
@@ -861,8 +840,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleq %zmm1, %zmm2, %k1
 ; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sge <8 x i64> %x1, %y1
 %yb = load i64, i64* %yb.ptr, align 4
@@ -934,8 +912,7 @@
 ; SKX-LABEL: test30:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
-; SKX-NEXT: vmovapd %ymm0, %ymm1 {%k1}
-; SKX-NEXT: vmovapd %ymm1, %ymm0
+; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT: retq
 %mask = fcmp oeq <4 x double> %x, %y
@@ -953,8 +930,7 @@
 ; SKX-LABEL: test31:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vcmpltpd (%rdi), %xmm0, %k1
-; SKX-NEXT: vmovapd %xmm0, %xmm1 {%k1}
-; SKX-NEXT: vmovapd %xmm1, %xmm0
+; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT: retq
 %y = load <2 x double>, <2 x double>* %yp, align 4
@@ -973,8 +949,7 @@
 ; SKX-LABEL: test32:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vcmpltpd (%rdi), %ymm0, %k1
-; SKX-NEXT: vmovapd %ymm0, %ymm1 {%k1}
-; SKX-NEXT: vmovapd %ymm1, %ymm0
+; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT: retq
 %y = load <4 x double>, <4 x double>* %yp, align 4
@@ -987,8 +962,7 @@
 ; CHECK-LABEL: test33:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vcmpltpd (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <8 x double>, <8 x double>* %yp, align 4
 %mask = fcmp olt <8 x double> %x, %y
@@ -1006,8 +980,7 @@
 ; SKX-LABEL: test34:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vcmpltps (%rdi), %xmm0, %k1
-; SKX-NEXT: vmovaps %xmm0, %xmm1 {%k1}
-; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT: retq
 %y = load <4 x float>, <4 x float>* %yp, align 4
 %mask = fcmp olt <4 x float> %x, %y
@@ -1029,8 +1002,7 @@
 ; SKX-LABEL: test35:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vcmpltps (%rdi), %ymm0, %k1
-; SKX-NEXT: vmovaps %ymm0, %ymm1 {%k1}
-; SKX-NEXT: vmovaps %ymm1, %ymm0
+; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT: retq
 %y = load <8 x float>, <8 x float>* %yp, align 4
@@ -1043,8 +1015,7 @@
 ; CHECK-LABEL: test36:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vcmpltps (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <16 x float>, <16 x float>* %yp, align 4
 %mask = fcmp olt <16 x float> %x, %y
@@ -1056,8 +1027,7 @@
 ; CHECK-LABEL: test37:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1
-; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %a = load double, double* %ptr
@@ -1080,8 +1050,7 @@
 ; SKX-LABEL: test38:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vcmpltpd (%rdi){1to4}, %ymm0, %k1
-; SKX-NEXT: vmovapd %ymm0, %ymm1 {%k1}
-; SKX-NEXT: vmovapd %ymm1, %ymm0
+; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT: retq
 %a = load double, double* %ptr
@@ -1104,8 +1073,7 @@
 ; SKX-LABEL: test39:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vcmpltpd (%rdi){1to2}, %xmm0, %k1
-; SKX-NEXT: vmovapd %xmm0, %xmm1 {%k1}
-; SKX-NEXT: vmovapd %xmm1, %xmm0
+; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT: retq
 %a = load double, double* %ptr
@@ -1122,8 +1090,7 @@
 ; CHECK-LABEL: test40:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vcmpltps (%rdi){1to16}, %zmm0, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %a = load float, float* %ptr
@@ -1149,8 +1116,7 @@
 ; SKX-LABEL: test41:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vcmpltps (%rdi){1to8}, %ymm0, %k1
-; SKX-NEXT: vmovaps %ymm0, %ymm1 {%k1}
-; SKX-NEXT: vmovaps %ymm1, %ymm0
+; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT: retq
 %a = load float, float* %ptr
@@ -1173,8 +1139,7 @@
 ; SKX-LABEL: test42:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vcmpltps (%rdi){1to4}, %xmm0, %k1
-; SKX-NEXT: vmovaps %xmm0, %xmm1 {%k1}
-; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT: retq
 %a = load float, float* %ptr
@@ -1193,8 +1158,7 @@
 ; KNL-NEXT: vpsllq $63, %zmm2, %zmm2
 ; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1
 ; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
-; KNL-NEXT: vmovapd %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovapd %zmm1, %zmm0
+; KNL-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: test43:
@@ -1202,8 +1166,7 @@
 ; SKX-NEXT: vpsllw $15, %xmm2, %xmm2
 ; SKX-NEXT: vpmovw2m %xmm2, %k1
 ; SKX-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
-; SKX-NEXT: vmovapd %zmm0, %zmm1 {%k1}
-; SKX-NEXT: vmovapd %zmm1, %zmm0
+; SKX-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
 ; SKX-NEXT: retq
 %a = load double, double* %ptr
Index: test/CodeGen/X86/avx512bw-vec-cmp.ll
===================================================================
--- test/CodeGen/X86/avx512bw-vec-cmp.ll
+++ test/CodeGen/X86/avx512bw-vec-cmp.ll
@@ -5,8 +5,7 @@
 ; CHECK-LABEL: test1:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
-; CHECK-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp eq <64 x i8> %x, %y
 %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y
@@ -17,8 +16,7 @@
 ; CHECK-LABEL: test2:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
-; CHECK-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmb %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp sgt <64 x i8> %x, %y
 %max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y
@@ -29,8 +27,7 @@
 ; CHECK-LABEL: test3:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k1
-; CHECK-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmw %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp sge <32 x i16> %x, %y
 %max = select <32 x i1> %mask, <32 x i16> %x1, <32 x i16> %y
@@ -41,8 +38,7 @@
 ; CHECK-LABEL: test4:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1
-; CHECK-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmb %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp ugt <64 x i8> %x, %y
 %max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y
@@ -53,8 +49,7 @@
 ; CHECK-LABEL: test5:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqw (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <32 x i16>, <32 x i16>* %yp, align 4
 %mask = icmp eq <32 x i16> %x, %y
@@ -66,8 +61,7 @@
 ; CHECK-LABEL: test6:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtw (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
 %mask = icmp sgt <32 x i16> %x, %y
@@ -79,8 +73,7 @@
 ; CHECK-LABEL: test7:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmplew (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
 %mask = icmp sle <32 x i16> %x, %y
@@ -92,8 +85,7 @@
 ; CHECK-LABEL: test8:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleuw (%rdi), %zmm0, %k1
-; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
 %mask = icmp ule <32 x i16> %x, %y
@@ -106,8 +98,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
 ; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 {%k1}
-; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp eq <32 x i16> %x1, %y1
 %mask0 = icmp eq <32 x i16> %x, %y
@@ -121,8 +112,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleb %zmm1, %zmm0, %k1
 ; CHECK-NEXT: vpcmpleb %zmm2, %zmm3, %k1 {%k1}
-; CHECK-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: vpblendmb %zmm0, %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sge <64 x i8> %x1, %y1
 %mask0 = icmp sle <64 x i8> %x, %y
@@ -136,8 +126,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtb %zmm2, %zmm1, %k1
 ; CHECK-NEXT: vpcmpgtb (%rdi), %zmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sgt <64 x i8> %x1, %y1
 %y = load <64 x i8>, <64 x i8>* %y.ptr, align 4
@@ -152,8 +141,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmplew %zmm1, %zmm2, %k1
 ; CHECK-NEXT: vpcmpleuw (%rdi), %zmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sge <32 x i16> %x1, %y1
 %y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
Index: test/CodeGen/X86/avx512bwvl-vec-cmp.ll
===================================================================
--- test/CodeGen/X86/avx512bwvl-vec-cmp.ll
+++ test/CodeGen/X86/avx512bwvl-vec-cmp.ll
@@ -5,8 +5,7 @@
 ; CHECK-LABEL: test256_1:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
-; CHECK-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp eq <32 x i8> %x, %y
 %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %y
@@ -17,8 +16,7 @@
 ; CHECK-LABEL: test256_2:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k1
-; CHECK-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp sgt <32 x i8> %x, %y
 %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
@@ -29,8 +27,7 @@
 ; CHECK-LABEL: test256_3:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k1
-; CHECK-NEXT: vmovdqu16 %ymm2, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmw %ymm2, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp sge <16 x i16> %x, %y
 %max = select <16 x i1> %mask, <16 x i16> %x1, <16 x i16> %y
@@ -41,8 +38,7 @@
 ; CHECK-LABEL: test256_4:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k1
-; CHECK-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp ugt <32 x i8> %x, %y
 %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
@@ -53,8 +49,7 @@
 ; CHECK-LABEL: test256_5:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <16 x i16>, <16 x i16>* %yp, align 4
 %mask = icmp eq <16 x i16> %x, %y
@@ -66,8 +61,7 @@
 ; CHECK-LABEL: test256_6:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
 %mask = icmp sgt <16 x i16> %x, %y
@@ -79,8 +73,7 @@
 ; CHECK-LABEL: test256_7:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmplew (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
 %mask = icmp sle <16 x i16> %x, %y
@@ -92,8 +85,7 @@
 ; CHECK-LABEL: test256_8:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleuw (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
 %mask = icmp ule <16 x i16> %x, %y
@@ -106,8 +98,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
 ; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 {%k1}
-; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp eq <16 x i16> %x1, %y1
 %mask0 = icmp eq <16 x i16> %x, %y
@@ -121,8 +112,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k1
 ; CHECK-NEXT: vpcmpleb %ymm2, %ymm3, %k1 {%k1}
-; CHECK-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sge <32 x i8> %x1, %y1
 %mask0 = icmp sle <32 x i8> %x, %y
@@ -136,8 +126,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtb %ymm2, %ymm1, %k1
 ; CHECK-NEXT: vpcmpgtb (%rdi), %ymm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sgt <32 x i8> %x1, %y1
 %y = load <32 x i8>, <32 x i8>* %y.ptr, align 4
@@ -152,8 +141,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmplew %ymm1, %ymm2, %k1
 ; CHECK-NEXT: vpcmpleuw (%rdi), %ymm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sge <16 x i16> %x1, %y1
 %y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
@@ -167,8 +155,7 @@
 ; CHECK-LABEL: test128_1:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp eq <16 x i8> %x, %y
 %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %y
@@ -179,8 +166,7 @@
 ; CHECK-LABEL: test128_2:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp sgt <16 x i8> %x, %y
 %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
@@ -191,8 +177,7 @@
 ; CHECK-LABEL: test128_3:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm2, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmw %xmm2, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp sge <8 x i16> %x, %y
 %max = select <8 x i1> %mask, <8 x i16> %x1, <8 x i16> %y
@@ -203,8 +188,7 @@
 ; CHECK-LABEL: test128_4:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp ugt <16 x i8> %x, %y
 %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
@@ -215,8 +199,7 @@
 ; CHECK-LABEL: test128_5:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <8 x i16>, <8 x i16>* %yp, align 4
 %mask = icmp eq <8 x i16> %x, %y
@@ -228,8 +211,7 @@
 ; CHECK-LABEL: test128_6:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
 %mask = icmp sgt <8 x i16> %x, %y
@@ -241,8 +223,7 @@
 ; CHECK-LABEL: test128_7:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmplew (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
 %mask = icmp sle <8 x i16> %x, %y
@@ -254,8 +235,7 @@
 ; CHECK-LABEL: test128_8:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleuw (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
 %mask = icmp ule <8 x i16> %x, %y
@@ -268,8 +248,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
 ; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 {%k1}
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp eq <8 x i16> %x1, %y1
 %mask0 = icmp eq <8 x i16> %x, %y
@@ -283,8 +262,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k1
 ; CHECK-NEXT: vpcmpleb %xmm2, %xmm3, %k1 {%k1}
-; CHECK-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sge <16 x i8> %x1, %y1
 %mask0 = icmp sle <16 x i8> %x, %y
@@ -298,8 +276,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtb %xmm2, %xmm1, %k1
 ; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sgt <16 x i8> %x1, %y1
 %y = load <16 x i8>, <16 x i8>* %y.ptr, align 4
@@ -314,8 +291,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmplew %xmm1, %xmm2, %k1
 ; CHECK-NEXT: vpcmpleuw (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sge <8 x i16> %x1, %y1
 %y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
Index: test/CodeGen/X86/avx512vl-vec-cmp.ll
===================================================================
--- test/CodeGen/X86/avx512vl-vec-cmp.ll
+++ test/CodeGen/X86/avx512vl-vec-cmp.ll
@@ -5,8 +5,7 @@
 ; CHECK-LABEL: test256_1:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp eq <4 x i64> %x, %y
 %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
@@ -17,8 +16,7 @@
 ; CHECK-LABEL: test256_2:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp sgt <4 x i64> %x, %y
 %max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y
@@ -29,8 +27,7 @@
 ; CHECK-LABEL: test256_3:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k1
-; CHECK-NEXT: vmovdqa32 %ymm2, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm2, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp sge <8 x i32> %x, %y
 %max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y
@@ -41,8 +38,7 @@
 ; CHECK-LABEL: test256_4:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp ugt <4 x i64> %x, %y
 %max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y
@@ -53,8 +49,7 @@
 ; CHECK-LABEL: test256_5:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <8 x i32>, <8 x i32>* %yp, align 4
 %mask = icmp eq <8 x i32> %x, %y
@@ -66,8 +61,7 @@
 ; CHECK-LABEL: test256_5b:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <8 x i32>, <8 x i32>* %yp, align 4
 %mask = icmp eq <8 x i32> %y, %x
@@ -79,8 +73,7 @@
 ; CHECK-LABEL: test256_6:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
 %mask = icmp sgt <8 x i32> %x, %y
@@ -92,8 +85,7 @@
 ; CHECK-LABEL: test256_6b:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
 %mask = icmp slt <8 x i32> %y, %x
@@ -105,8 +97,7 @@
 ; CHECK-LABEL: test256_7:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
 %mask = icmp sle <8 x i32> %x, %y
@@ -118,8 +109,7 @@
 ; CHECK-LABEL: test256_7b:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
 %mask = icmp sge <8 x i32> %y, %x
@@ -131,8 +121,7 @@
 ; CHECK-LABEL: test256_8:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
 %mask = icmp ule <8 x i32> %x, %y
@@ -144,8 +133,7 @@
 ; CHECK-LABEL: test256_8b:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
 %mask = icmp uge <8 x i32> %y, %x
@@ -158,8 +146,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp eq <8 x i32> %x1, %y1
 %mask0 = icmp eq <8 x i32> %x, %y
@@ -173,8 +160,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k1
 ; CHECK-NEXT: vpcmpleq %ymm2, %ymm3, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sge <4 x i64> %x1, %y1
 %mask0 = icmp sle <4 x i64> %x, %y
@@ -188,8 +174,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtq %ymm2, %ymm1, %k1
 ; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sgt <4 x i64> %x1, %y1
 %y = load <4 x i64>, <4 x i64>* %y.ptr, align 4
@@ -204,8 +189,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1
 ; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sge <8 x i32> %x1, %y1
 %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
@@ -219,8 +203,7 @@
 ; CHECK-LABEL: test256_13:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %yb = load i64, i64* %yb.ptr, align 4
 %y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0
@@ -234,8 +217,7 @@
 ; CHECK-LABEL: test256_14:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpled (%rdi){1to8}, %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %yb = load i32, i32* %yb.ptr, align 4
 %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
@@ -250,8 +232,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1
 ; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sge <8 x i32> %x1, %y1
 %yb = load i32, i32* %yb.ptr, align 4
@@ -268,8 +249,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleq %ymm1, %ymm2, %k1
 ; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sge <4 x i64> %x1, %y1
 %yb = load i64, i64* %yb.ptr, align 4
@@ -285,8 +265,7 @@
 ; CHECK-LABEL: test256_17:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <8 x i32>, <8 x i32>* %yp, align 4
 %mask = icmp ne <8 x i32> %x, %y
@@ -298,8 +277,7 @@
 ; CHECK-LABEL: test256_18:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <8 x i32>, <8 x i32>* %yp, align 4
 %mask = icmp ne <8 x i32> %y, %x
@@ -311,8 +289,7 @@
 ; CHECK-LABEL: test256_19:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpnltud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <8 x i32>, <8 x i32>* %yp, align 4
 %mask = icmp uge <8 x i32> %x, %y
@@ -324,8 +301,7 @@
 ; CHECK-LABEL: test256_20:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <8 x i32>, <8 x i32>* %yp, align 4
 %mask = icmp uge <8 x i32> %y, %x
@@ -337,8 +313,7 @@
 ; CHECK-LABEL: test128_1:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp eq <2 x i64> %x, %y
 %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
@@ -349,8 +324,7 @@
 ; CHECK-LABEL: test128_2:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp sgt <2 x i64> %x, %y
 %max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y
@@ -361,8 +335,7 @@
 ; CHECK-LABEL: test128_3:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm2, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp sge <4 x i32> %x, %y
 %max = select <4 x i1> %mask, <4 x i32> %x1, <4 x i32> %y
@@ -373,8 +346,7 @@
 ; CHECK-LABEL: test128_4:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask = icmp ugt <2 x i64> %x, %y
 %max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y
@@ -385,8 +357,7 @@
 ; CHECK-LABEL: test128_5:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <4 x i32>, <4 x i32>* %yp, align 4
 %mask = icmp eq <4 x i32> %x, %y
@@ -398,8 +369,7 @@
 ; CHECK-LABEL: test128_5b:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <4 x i32>, <4 x i32>* %yp, align 4
 %mask = icmp eq <4 x i32> %y, %x
@@ -411,8 +381,7 @@
 ; CHECK-LABEL: test128_6:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
 %mask = icmp sgt <4 x i32> %x, %y
@@ -424,8 +393,7 @@
 ; CHECK-LABEL: test128_6b:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
 %mask = icmp slt <4 x i32> %y, %x
@@ -437,8 +405,7 @@
 ; CHECK-LABEL: test128_7:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
 %mask = icmp sle <4 x i32> %x, %y
@@ -450,8 +417,7 @@
 ; CHECK-LABEL: test128_7b:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
 %mask = icmp sge <4 x i32> %y, %x
@@ -463,8 +429,7 @@
 ; CHECK-LABEL: test128_8:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
 %mask = icmp ule <4 x i32> %x, %y
@@ -476,8 +441,7 @@
 ; CHECK-LABEL: test128_8b:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
 %mask = icmp uge <4 x i32> %y, %x
@@ -490,8 +454,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
 ; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp eq <4 x i32> %x1, %y1
 %mask0 = icmp eq <4 x i32> %x, %y
@@ -505,8 +468,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k1
 ; CHECK-NEXT: vpcmpleq %xmm2, %xmm3, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: vpblendmq %xmm0, %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sge <2 x i64> %x1, %y1
 %mask0 = icmp sle <2 x i64> %x, %y
@@ -520,8 +482,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpgtq %xmm2, %xmm1, %k1
 ; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sgt <2 x i64> %x1, %y1
 %y = load <2 x i64>, <2 x i64>* %y.ptr, align 4
@@ -536,8 +497,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1
 ; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sge <4 x i32> %x1, %y1
 %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
@@ -551,8 +511,7 @@
 ; CHECK-LABEL: test128_13:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k1
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %yb = load i64, i64* %yb.ptr, align 4
 %y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0
@@ -566,8 +525,7 @@
 ; CHECK-LABEL: test128_14:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpled (%rdi){1to4}, %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %yb = load i32, i32* %yb.ptr, align 4
 %y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0
@@ -582,8 +540,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1
 ; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sge <4 x i32> %x1, %y1
 %yb = load i32, i32* %yb.ptr, align 4
@@ -600,8 +557,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleq %xmm1, %xmm2, %k1
 ; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %mask1 = icmp sge <2 x i64> %x1, %y1
 %yb = load i64, i64* %yb.ptr, align 4
@@ -617,8 +573,7 @@
 ; CHECK-LABEL: test128_17:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
 %mask = icmp ne <4 x i32> %x, %y
@@ -630,8 +585,7 @@
 ; CHECK-LABEL: test128_18:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
 %mask = icmp ne <4 x i32> %y, %x
@@ -643,8 +597,7 @@
 ; CHECK-LABEL: test128_19:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpnltud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
 %mask = icmp uge <4 x i32> %x, %y
@@ -656,8 +609,7 @@
 ; CHECK-LABEL: test128_20:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
 %mask = icmp uge <4 x i32> %y, %x
Index: test/CodeGen/X86/masked_memop.ll
===================================================================
--- test/CodeGen/X86/masked_memop.ll
+++ test/CodeGen/X86/masked_memop.ll
@@ -29,8 +29,7 @@
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
-; SKX-NEXT: vmovupd (%rdi), %xmm1 {%k1}
-; SKX-NEXT: vmovapd %xmm1, %xmm0
+; SKX-NEXT: vblendmpd (%rdi), %xmm1, %xmm0 {%k1}
 ; SKX-NEXT: retq
 %mask = icmp eq <2 x i64> %trigger, zeroinitializer
 %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
@@ -58,8 +57,7 @@
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
-; SKX-NEXT: vmovups (%rdi), %xmm1 {%k1}
-; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
 ; SKX-NEXT: retq
 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
 %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst)
@@ -95,8 +93,7 @@
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
-; SKX-NEXT: vmovdqu32 (%rdi), %xmm1 {%k1}
-; SKX-NEXT: vmovdqa %xmm1, %xmm0
+; SKX-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
 ; SKX-NEXT: retq
 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
 %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
@@ -171,8 +168,7 @@
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
-; SKX-NEXT: vmovapd (%rdi), %ymm1 {%k1}
-; SKX-NEXT: vmovapd %ymm1, %ymm0
+; SKX-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
 ; SKX-NEXT: retq
 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
 %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst)
@@ -246,16 +242,15 @@
 ; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
 ; AVX512F-NEXT: kshiftlw $8, %k0, %k0
 ; AVX512F-NEXT: kshiftrw $8, %k0, %k1
-; AVX512F-NEXT: vmovups (%rdi), %zmm1 {%k1}
-; AVX512F-NEXT: vmovaps %ymm1, %ymm0
+; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
 ; AVX512F-NEXT: retq
 ;
 ; SKX-LABEL: test11a:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpxor %ymm2, %ymm2, %ymm2
 ; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
-; SKX-NEXT: vmovaps (%rdi), %ymm1 {%k1}
-; SKX-NEXT: vmovaps %ymm1, %ymm0
+; SKX-NEXT: vblendmps (%rdi), %ymm1, %ymm0 {%k1}
 ; SKX-NEXT: retq
 %mask = icmp eq <8 x i32> %trigger, zeroinitializer
 %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst)
@@ -293,16 +288,15 @@
 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kshiftlw $8, %k0, %k0
 ; AVX512F-NEXT: kshiftrw $8, %k0, %k1
-; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
 ; AVX512F-NEXT: retq
 ;
 ; SKX-LABEL: test11b:
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
 ; SKX-NEXT: vpmovw2m %xmm0, %k1
-; SKX-NEXT: vmovdqu32 (%rdi), %ymm1 {%k1}
-; SKX-NEXT: vmovdqa %ymm1, %ymm0
+; SKX-NEXT: vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
 ; SKX-NEXT: retq
 %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst)
 ret <8 x i32> %res
@@ -557,8 +551,7 @@
 ; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k0
 ; SKX-NEXT: kshiftlw $14, %k0, %k0
 ; SKX-NEXT: kshiftrw $14, %k0, %k1
-; SKX-NEXT: vmovups (%rdi), %xmm1 {%k1}
-; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
 ; SKX-NEXT: retq
 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
 %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
Index: test/CodeGen/X86/vector-shuffle-masked.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-masked.ll
+++ test/CodeGen/X86/vector-shuffle-masked.ll
@@ -216,8 +216,7 @@
 ; CHECK: # BB#0:
 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0]
 ; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32>
 %mask.cast = bitcast i8 %mask to <8 x i1>
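
Reviewer note, not part of the patch: in the visible portion of the source change, the masked-move patterns in X86InstrAVX512.td gain isConvertibleToThreeAddress = 1. Because those patterns tie the pass-through operand to the destination ("$src0 = $dst"), the flag presumably lets the two-address instruction pass convert the tied masked move into a three-operand masked blend, which is exactly what the updated CHECK lines show. A minimal before/after sketch, copied from test3 in avx512-vec-cmp.ll:

  ; before: masked move must write the pass-through register, then copy to the return register
  vpcmpeqd (%rdi), %zmm0, %k1
  vmovdqa32 %zmm0, %zmm1 {%k1}
  vmovdqa64 %zmm1, %zmm0

  ; after: a single vpblendmd writes %zmm0 directly, so the trailing copy disappears
  vpcmpeqd (%rdi), %zmm0, %k1
  vpblendmd %zmm0, %zmm1, %zmm0 {%k1}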