Index: llvm/trunk/lib/Target/X86/X86DomainReassignment.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86DomainReassignment.cpp
+++ llvm/trunk/lib/Target/X86/X86DomainReassignment.cpp
@@ -732,7 +732,10 @@
   STI = &MF.getSubtarget();
   // GPR->K is the only transformation currently supported, bail out early if no
   // AVX512.
-  if (!STI->hasAVX512())
+  // TODO: We're also bailing if AVX512BW isn't supported, since we use VK32 and
+  // VK64 for GR32/GR64, but those aren't legal classes on KNL. If the register
+  // coalescer doesn't clean it up and we generate a spill, we will crash.
+  if (!STI->hasAVX512() || !STI->hasBWI())
     return false;
 
   MRI = &MF.getRegInfo();
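For context on the bail-out above: spilling or reloading a mask register goes through a KMOV* move, and the width of that move is gated on a CPU feature (KMOVB needs AVX512DQ, KMOVW needs only AVX512F, KMOVD/KMOVQ need AVX512BW). A minimal standalone sketch of that constraint follows; it is illustrative only — the `Features` struct and `canSpillMaskClass` helper are hypothetical and not LLVM APIs:

```cpp
#include <cstdio>

// Hypothetical model of the feature gate described in the TODO above:
// VK32/VK64 values can only be spilled via KMOVD/KMOVQ, which require
// AVX512BW, so a KNL-class target (AVX512F only) must never see them.
struct Features { bool AVX512F, AVX512DQ, AVX512BW; };

static bool canSpillMaskClass(unsigned Bits, const Features &F) {
  switch (Bits) {
  case 8:  return F.AVX512DQ;           // KMOVB
  case 16: return F.AVX512F;            // KMOVW
  case 32:                              // KMOVD
  case 64: return F.AVX512BW;           // KMOVQ
  default: return false;
  }
}

int main() {
  const Features KNL = {true, false, false}; // Knights Landing
  const Features SKX = {true, true, true};   // Skylake-AVX512
  std::printf("spill VK32 on KNL: %d, on SKX: %d\n",
              canSpillMaskClass(32, KNL), canSpillMaskClass(32, SKX));
  return 0;
}
```

The guard `!STI->hasAVX512() || !STI->hasBWI()` in the hunk above encodes the same table: without BWI, the GR32/GR64 to VK32/VK64 reassignment is skipped entirely rather than risking an unspillable register class.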
Index: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -169,7 +169,8 @@
 define <16 x float> @test_mm512_mask_shuffle_f32x4(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
 ; X86-LABEL: test_mm512_mask_shuffle_f32x4:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
 ; X86-NEXT: retl
 ;
@@ -188,7 +189,8 @@
 define <16 x float> @test_mm512_maskz_shuffle_f32x4(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
 ; X86-LABEL: test_mm512_maskz_shuffle_f32x4:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
 ; X86-NEXT: retl
 ;
@@ -267,7 +269,8 @@
 define <8 x i64> @test_mm512_mask_shuffle_i32x4(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
 ; X86-LABEL: test_mm512_mask_shuffle_i32x4:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
 ; X86-NEXT: retl
 ;
@@ -289,7 +292,8 @@
 define <8 x i64> @test_mm512_maskz_shuffle_i32x4(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
 ; X86-LABEL: test_mm512_maskz_shuffle_i32x4:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
 ; X86-NEXT: retl
 ;
@@ -377,7 +381,8 @@
 define zeroext i16 @test_mm512_mask_testn_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_mask_testn_epi32_mask:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
 ; X86-NEXT: kmovw %k0, %eax
 ; X86-NEXT: movzwl %ax, %eax
@@ -448,7 +453,8 @@
 define zeroext i16 @test_mm512_mask_test_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_mask_test_epi32_mask:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
 ; X86-NEXT: kmovw %k0, %eax
 ; X86-NEXT: movzwl %ax, %eax
@@ -505,7 +511,8 @@
 ; X86-LABEL: test_mm512_mask_set1_epi32:
 ; X86: # %bb.0: # %entry
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: kmovw %ecx, %k1
 ; X86-NEXT: vpbroadcastd %eax, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -528,7 +535,8 @@
 ; X86-LABEL: test_mm512_maskz_set1_epi32:
 ; X86: # %bb.0: # %entry
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: kmovw %ecx, %k1
 ; X86-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -607,7 +615,8 @@
 define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) {
 ; X86-LABEL: test_mm512_mask_broadcastd_epi32:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -628,7 +637,8 @@
 define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
 ; X86-LABEL: test_mm512_maskz_broadcastd_epi32:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -751,7 +761,8 @@
 define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) {
 ; X86-LABEL: test_mm512_mask_broadcastss_ps:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -769,7 +780,8 @@
 define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
 ; X86-LABEL: test_mm512_maskz_broadcastss_ps:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -843,7 +855,8 @@
 define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
 ; X86-LABEL: test_mm512_mask_movehdup_ps:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; X86-NEXT: retl
 ;
@@ -861,7 +874,8 @@
 define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
 ; X86-LABEL: test_mm512_maskz_movehdup_ps:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; X86-NEXT: retl
 ;
@@ -888,7 +902,8 @@
 define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
 ; X86-LABEL: test_mm512_mask_moveldup_ps:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; X86-NEXT: retl
 ;
@@ -906,7 +921,8 @@
 define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
 ; X86-LABEL: test_mm512_maskz_moveldup_ps:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; X86-NEXT: retl
 ;
@@ -980,7 +996,8 @@
 define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
 ; X86-LABEL: test_mm512_mask_permute_ps:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
 ; X86-NEXT: retl
 ;
@@ -998,7 +1015,8 @@
 define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
 ; X86-LABEL: test_mm512_maskz_permute_ps:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
 ; X86-NEXT: retl
 ;
@@ -1121,7 +1139,8 @@
 define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) {
 ; X86-LABEL: test_mm512_mask_shuffle_epi32:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
 ; X86-NEXT: retl
 ;
@@ -1142,7 +1161,8 @@
 define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
 ; X86-LABEL: test_mm512_maskz_shuffle_epi32:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
 ; X86-NEXT: retl
 ;
@@ -1221,7 +1241,8 @@
 define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
 ; X86-LABEL: test_mm512_mask_unpackhi_epi32:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
 ; X86-NEXT: retl
 ;
@@ -1243,7 +1264,8 @@
 define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
 ; X86-LABEL: test_mm512_maskz_unpackhi_epi32:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; X86-NEXT: retl
 ;
@@ -1367,7 +1389,8 @@
 define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
 ; X86-LABEL: test_mm512_mask_unpackhi_ps:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
 ; X86-NEXT: retl
 ;
@@ -1385,7 +1408,8 @@
 define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
 ; X86-LABEL: test_mm512_maskz_unpackhi_ps:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; X86-NEXT: retl
 ;
@@ -1415,7 +1439,8 @@
 define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
 ; X86-LABEL: test_mm512_mask_unpacklo_epi32:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
 ; X86-NEXT: retl
 ;
@@ -1437,7 +1462,8 @@
 define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
 ; X86-LABEL: test_mm512_maskz_unpacklo_epi32:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
 ; X86-NEXT: retl
 ;
@@ -1561,7 +1587,8 @@
 define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
 ; X86-LABEL: test_mm512_mask_unpacklo_ps:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
 ; X86-NEXT: retl
 ;
@@ -1579,7 +1606,8 @@
 define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
 ; X86-LABEL: test_mm512_maskz_unpacklo_ps:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
 ; X86-NEXT: retl
 ;
@@ -1978,7 +2006,8 @@
 define <2 x i64> @test_mm512_mask_cvtepi32_epi8(<2 x i64> %__O, i16 zeroext %__M, <8 x i64> %__A) {
 ; X86-LABEL: test_mm512_mask_cvtepi32_epi8:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpmovdb %zmm1, %xmm0 {%k1}
 ; X86-NEXT: vzeroupper
 ; X86-NEXT: retl
@@ -2000,7 +2029,8 @@
 define <2 x i64> @test_mm512_maskz_cvtepi32_epi8(i16 zeroext %__M, <8 x i64> %__A) {
 ; X86-LABEL: test_mm512_maskz_cvtepi32_epi8:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT: vzeroupper
 ; X86-NEXT: retl
@@ -2149,7 +2179,8 @@
 define <8 x i64> @test_mm512_mask_ternarylogic_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
 ; X86-LABEL: test_mm512_mask_ternarylogic_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -2172,7 +2203,8 @@
 define <8 x i64> @test_mm512_maskz_ternarylogic_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
 ; X86-LABEL: test_mm512_maskz_ternarylogic_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -2249,7 +2281,8 @@
 define <8 x i64> @test_mm512_mask2_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, i16 zeroext %__U, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_mask2_permutex2var_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
 ; X86-NEXT: retl
@@ -2301,7 +2334,8 @@
 define <16 x float> @test_mm512_mask2_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, i16 zeroext %__U, <16 x float> %__B) {
 ; X86-LABEL: test_mm512_mask2_permutex2var_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
 ; X86-NEXT: vmovaps %zmm1, %zmm0
 ; X86-NEXT: retl
@@ -2362,7 +2396,8 @@
 define <8 x i64> @test_mm512_maskz_permutex2var_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_maskz_permutex2var_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -2385,7 +2420,8 @@
 define <8 x i64> @test_mm512_mask_permutex2var_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_mask_permutex2var_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -2469,7 +2505,8 @@
 define <16 x float> @test_mm512_mask_permutex2var_ps(<16 x float> %__A, i16 zeroext %__U, <8 x i64> %__I, <16 x float> %__B) {
 ; X86-LABEL: test_mm512_mask_permutex2var_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -2489,7 +2526,8 @@
 define <16 x float> @test_mm512_maskz_permutex2var_ps(i16 zeroext %__U, <16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
 ; X86-LABEL: test_mm512_maskz_permutex2var_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -3427,7 +3465,8 @@
 define <16 x float> @test_mm512_mask_fmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_mask_fmadd_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -3446,7 +3485,8 @@
 define <16 x float> @test_mm512_mask3_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
 ; X86-LABEL: test_mm512_mask3_fmadd_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
 ; X86-NEXT: vmovaps %zmm2, %zmm0
 ; X86-NEXT: retl
@@ -3467,7 +3507,8 @@
 define <16 x float> @test_mm512_maskz_fmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_maskz_fmadd_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -3504,7 +3545,8 @@
 define <16 x float> @test_mm512_mask_fmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_mask_fmsub_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -3524,7 +3566,8 @@
 define <16 x float> @test_mm512_maskz_fmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_maskz_fmsub_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -3562,7 +3605,8 @@
 define <16 x float> @test_mm512_mask3_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
 ; X86-LABEL: test_mm512_mask3_fnmadd_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
 ; X86-NEXT: vmovaps %zmm2, %zmm0
 ; X86-NEXT: retl
@@ -3584,7 +3628,8 @@
 define <16 x float> @test_mm512_maskz_fnmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_maskz_fnmadd_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -3619,7 +3664,8 @@
 define <16 x float> @test_mm512_maskz_fnmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_maskz_fnmsub_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -3650,7 +3696,8 @@
 define <16 x float> @test_mm512_mask_fmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_mask_fmadd_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
 ; X86-NEXT: retl
 ;
@@ -3669,7 +3716,8 @@
 define <16 x float> @test_mm512_mask3_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
 ; X86-LABEL: test_mm512_mask3_fmadd_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
 ; X86-NEXT: vmovaps %zmm2, %zmm0
 ; X86-NEXT: retl
@@ -3690,7 +3738,8 @@
 define <16 x float> @test_mm512_maskz_fmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_maskz_fmadd_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
 ; X86-NEXT: retl
 ;
@@ -3727,7 +3776,8 @@
 define <16 x float> @test_mm512_mask_fmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_mask_fmsub_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
 ; X86-NEXT: retl
 ;
@@ -3747,7 +3797,8 @@
 define <16 x float> @test_mm512_maskz_fmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_maskz_fmsub_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
 ; X86-NEXT: retl
 ;
@@ -3785,7 +3836,8 @@
 define <16 x float> @test_mm512_mask3_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
 ; X86-LABEL: test_mm512_mask3_fnmadd_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
 ; X86-NEXT: vmovaps %zmm2, %zmm0
 ; X86-NEXT: retl
@@ -3807,7 +3859,8 @@
 define <16 x float> @test_mm512_maskz_fnmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_maskz_fnmadd_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
 ; X86-NEXT: retl
 ;
@@ -3842,7 +3895,8 @@
 define <16 x float> @test_mm512_maskz_fnmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_maskz_fnmsub_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
 ; X86-NEXT: retl
 ;
@@ -4152,7 +4206,8 @@
 define <16 x float> @test_mm512_mask_fmaddsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_mask_fmaddsub_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -4171,7 +4226,8 @@
 define <16 x float> @test_mm512_mask3_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
 ; X86-LABEL: test_mm512_mask3_fmaddsub_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
 ; X86-NEXT: vmovaps %zmm2, %zmm0
 ; X86-NEXT: retl
@@ -4192,7 +4248,8 @@
 define <16 x float> @test_mm512_maskz_fmaddsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_maskz_fmaddsub_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -4229,7 +4286,8 @@
 define <16 x float> @test_mm512_mask_fmsubadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_mask_fmsubadd_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -4249,7 +4307,8 @@
 define <16 x float> @test_mm512_maskz_fmsubadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_maskz_fmsubadd_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -4282,7 +4341,8 @@
 define <16 x float> @test_mm512_mask_fmaddsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_mask_fmaddsub_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmaddsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
 ; X86-NEXT: retl
 ;
@@ -4304,7 +4364,8 @@
 define <16 x float> @test_mm512_mask3_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
 ; X86-LABEL: test_mm512_mask3_fmaddsub_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmaddsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
 ; X86-NEXT: vmovaps %zmm2, %zmm0
 ; X86-NEXT: retl
@@ -4328,7 +4389,8 @@
 define <16 x float> @test_mm512_maskz_fmaddsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_maskz_fmaddsub_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
 ; X86-NEXT: retl
 ;
@@ -4363,7 +4425,8 @@
 define <16 x float> @test_mm512_mask_fmsubadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_mask_fmsubadd_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmsubadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
 ; X86-NEXT: retl
 ;
@@ -4385,7 +4448,8 @@
 define <16 x float> @test_mm512_maskz_fmsubadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_maskz_fmsubadd_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
 ; X86-NEXT: retl
 ;
@@ -4453,7 +4517,8 @@
 define <16 x float> @test_mm512_mask3_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
 ; X86-LABEL: test_mm512_mask3_fmsub_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
 ; X86-NEXT: vmovaps %zmm2, %zmm0
 ; X86-NEXT: retl
@@ -4475,7 +4540,8 @@
 define <16 x float> @test_mm512_mask3_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
 ; X86-LABEL: test_mm512_mask3_fmsub_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
 ; X86-NEXT: vmovaps %zmm2, %zmm0
 ; X86-NEXT: retl
@@ -4545,7 +4611,8 @@
 define <16 x float> @test_mm512_mask3_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
 ; X86-LABEL: test_mm512_mask3_fmsubadd_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
 ; X86-NEXT: vmovaps %zmm2, %zmm0
 ; X86-NEXT: retl
@@ -4567,7 +4634,8 @@
 define <16 x float> @test_mm512_mask3_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
 ; X86-LABEL: test_mm512_mask3_fmsubadd_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfmsubadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
 ; X86-NEXT: vmovaps %zmm2, %zmm0
 ; X86-NEXT: retl
@@ -4633,7 +4701,8 @@
 define <16 x float> @test_mm512_mask_fnmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_mask_fnmadd_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -4653,7 +4722,8 @@
 define <16 x float> @test_mm512_mask_fnmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_mask_fnmadd_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
 ; X86-NEXT: retl
 ;
@@ -4765,7 +4835,8 @@
 define <16 x float> @test_mm512_mask_fnmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_mask_fnmsub_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -4786,7 +4857,8 @@
 define <16 x float> @test_mm512_mask3_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
 ; X86-LABEL: test_mm512_mask3_fnmsub_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
 ; X86-NEXT: vmovaps %zmm2, %zmm0
 ; X86-NEXT: retl
@@ -4809,7 +4881,8 @@
 define <16 x float> @test_mm512_mask_fnmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
 ; X86-LABEL: test_mm512_mask_fnmsub_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfnmsub132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
 ; X86-NEXT: retl
 ;
@@ -4830,7 +4903,8 @@
 define <16 x float> @test_mm512_mask3_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
 ; X86-LABEL: test_mm512_mask3_fnmsub_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vfnmsub231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
 ; X86-NEXT: vmovaps %zmm2, %zmm0
 ; X86-NEXT: retl
@@ -6252,7 +6326,8 @@
 ; X86-LABEL: test_mm512_mask_expandloadu_epi32:
 ; X86: # %bb.0: # %entry
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: kmovw %ecx, %k1
 ; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -6274,7 +6349,8 @@
 ; X86-LABEL: test_mm512_maskz_expandloadu_epi32:
 ; X86: # %bb.0: # %entry
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: kmovw %ecx, %k1
 ; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -6295,7 +6371,8 @@
 ; X86-LABEL: test_mm512_mask_expandloadu_ps:
 ; X86: # %bb.0: # %entry
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: kmovw %ecx, %k1
 ; X86-NEXT: vexpandps (%eax), %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -6315,7 +6392,8 @@
 ; X86-LABEL: test_mm512_maskz_expandloadu_ps:
 ; X86: # %bb.0: # %entry
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: kmovw %ecx, %k1
 ; X86-NEXT: vexpandps (%eax), %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -6380,9 +6458,10 @@
 define void @test_mm512_mask_compressstoreu_ps(i8* %__P, i16 zeroext %__U, <16 x float> %__A) {
 ; X86-LABEL: test_mm512_mask_compressstoreu_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vcompressps %zmm0, (%eax) {%k1}
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: kmovw %eax, %k1
+; X86-NEXT: vcompressps %zmm0, (%ecx) {%k1}
 ; X86-NEXT: vzeroupper
 ; X86-NEXT: retl
 ;
@@ -6402,9 +6481,10 @@
 define void @test_mm512_mask_compressstoreu_epi32(i8* %__P, i16 zeroext %__U, <8 x i64> %__A) {
 ; X86-LABEL: test_mm512_mask_compressstoreu_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vpcompressd %zmm0, (%eax) {%k1}
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: kmovw %eax, %k1
+; X86-NEXT: vpcompressd %zmm0, (%ecx) {%k1}
 ; X86-NEXT: vzeroupper
 ; X86-NEXT: retl
 ;
@@ -6973,7 +7053,8 @@
 define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
 ; X86-LABEL: test_mm512_mask_reduce_add_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
 ; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0
@@ -7029,7 +7110,8 @@
 define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) {
 ; X86-LABEL: test_mm512_mask_reduce_mul_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
@@ -7087,7 +7169,8 @@
 define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) {
 ; X86-LABEL: test_mm512_mask_reduce_and_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
@@ -7141,7 +7224,8 @@
 define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) {
 ; X86-LABEL: test_mm512_mask_reduce_or_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
 ; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
@@ -7499,7 +7583,8 @@
 ; X86: # %bb.0: # %entry
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
 ; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
@@ -7555,7 +7640,8 @@
 ; X86: # %bb.0: # %entry
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
 ; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
@@ -8415,7 +8501,8 @@
 define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) {
 ; X86-LABEL: test_mm512_mask_reduce_max_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
@@ -8477,7 +8564,8 @@
 define i32 @test_mm512_mask_reduce_max_epu32(i16 zeroext %__M, <8 x i64> %__W) {
 ; X86-LABEL: test_mm512_mask_reduce_max_epu32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
 ; X86-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
@@ -8539,7 +8627,8 @@
 ; X86: # %bb.0: # %entry
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
 ; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
 ; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
@@ -8595,7 +8684,8 @@
 define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) {
 ; X86-LABEL: test_mm512_mask_reduce_min_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
@@ -8657,7 +8747,8 @@
 define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) {
 ; X86-LABEL: test_mm512_mask_reduce_min_epu32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
@@ -8721,7 +8812,8 @@
 ; X86: # %bb.0: # %entry
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
 ; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
 ; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
@@ -8817,7 +8909,8 @@
 define <16 x float> @test_mm512_mask_max_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
 ; X86-LABEL: test_mm512_mask_max_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -8888,7 +8981,8 @@
 define <16 x float> @test_mm512_maskz_max_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
 ; X86-LABEL: test_mm512_maskz_max_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -8907,7 +9001,8 @@
 define <16 x float> @test_mm512_mask_max_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
 ; X86-LABEL: test_mm512_mask_max_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -8928,7 +9023,8 @@
 define <16 x float> @test_mm512_maskz_max_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
 ; X86-LABEL: test_mm512_maskz_max_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -9049,7 +9145,8 @@
 define <16 x float> @test_mm512_mask_min_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
 ; X86-LABEL: test_mm512_mask_min_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -9068,7 +9165,8 @@
 define <16 x float> @test_mm512_maskz_min_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
 ; X86-LABEL: test_mm512_maskz_min_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -9087,7 +9185,8 @@
 define <16 x float> @test_mm512_mask_min_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
 ; X86-LABEL: test_mm512_mask_min_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -9108,7 +9207,8 @@
 define <16 x float> @test_mm512_maskz_min_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
 ; X86-LABEL: test_mm512_maskz_min_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -9249,7 +9349,8 @@
 define <16 x float> @test_mm512_mask_sqrt_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
 ; X86-LABEL: test_mm512_mask_sqrt_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vsqrtps %zmm1, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -9268,7 +9369,8 @@
 define <16 x float> @test_mm512_maskz_sqrt_ps(i16 zeroext %__U, <16 x float> %__A) {
 ; X86-LABEL: test_mm512_maskz_sqrt_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -9287,7 +9389,8 @@
 define <16 x float> @test_mm512_mask_sqrt_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
 ; X86-LABEL: test_mm512_mask_sqrt_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -9308,7 +9411,8 @@
 define <16 x float> @test_mm512_maskz_sqrt_round_ps(i16 zeroext %__U, <16 x float> %__A) {
 ; X86-LABEL: test_mm512_maskz_sqrt_round_ps:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -9349,7 +9453,8 @@
 define <8 x i64> @test_mm512_mask_rol_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
 ; X86-LABEL: test_mm512_mask_rol_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vprold $5, %zmm1, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -9371,7 +9476,8 @@
 define <8 x i64> @test_mm512_maskz_rol_epi32(i16 zeroext %__U, <8 x i64> %__A) {
 ; X86-LABEL: test_mm512_maskz_rol_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vprold $5, %zmm0, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -9455,7 +9561,8 @@
 define <8 x i64> @test_mm512_mask_rolv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_mask_rolv_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vprolvd %zmm2, %zmm1, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -9478,7 +9585,8 @@
 define <8 x i64> @test_mm512_maskz_rolv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_maskz_rolv_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -9563,7 +9671,8 @@
 define <8 x i64> @test_mm512_mask_ror_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
 ; X86-LABEL: test_mm512_mask_ror_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vprord $5, %zmm1, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -9585,7 +9694,8 @@
 define <8 x i64> @test_mm512_maskz_ror_epi32(i16 zeroext %__U, <8 x i64> %__A) {
 ; X86-LABEL: test_mm512_maskz_ror_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vprord $5, %zmm0, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
@@ -9669,7 +9779,8 @@
 define <8 x i64> @test_mm512_mask_rorv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_mask_rorv_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vprorvd %zmm2, %zmm1, %zmm0 {%k1}
 ; X86-NEXT: retl
 ;
@@ -9692,7 +9803,8 @@
 define <8 x i64> @test_mm512_maskz_rorv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_maskz_rorv_epi32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
 ; X86-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
Index: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
@@ -1278,19 +1278,23 @@
 define <8 x i1> @test18(i8 %a, i16 %y) {
 ; KNL-LABEL: test18:
 ; KNL: ## %bb.0:
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kmovw %esi, %k2
-; KNL-NEXT: kshiftrw $8, %k2, %k0
-; KNL-NEXT: kshiftrw $9, %k2, %k2
-; KNL-NEXT: kshiftrw $6, %k1, %k3
-; KNL-NEXT: kxorw %k2, %k3, %k2
-; KNL-NEXT: kshiftlw $15, %k2, %k2
-; KNL-NEXT: kshiftrw $9, %k2, %k2
+; KNL-NEXT: kmovw %edi, %k0
+; KNL-NEXT: kmovw %esi, %k1
+; KNL-NEXT: kshiftrw $8, %k1, %k2
+; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: kshiftrw $9, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kshiftrw $6, %k0, %k1
+; KNL-NEXT: kmovw %ecx, %k2
 ; KNL-NEXT: kxorw %k2, %k1, %k1
-; KNL-NEXT: kshiftlw $9, %k1, %k1
+; KNL-NEXT: kshiftlw $15, %k1, %k1
 ; KNL-NEXT: kshiftrw $9, %k1, %k1
-; KNL-NEXT: kshiftlw $7, %k0, %k0
-; KNL-NEXT: korw %k0, %k1, %k1
+; KNL-NEXT: kxorw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $9, %k0, %k0
+; KNL-NEXT: kshiftrw $9, %k0, %k0
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kshiftlw $7, %k1, %k1
+; KNL-NEXT: korw %k1, %k0, %k1
 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT: vpmovdw %zmm0, %ymm0
 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -1337,19 +1341,23 @@
 ;
 ; AVX512DQ-LABEL: test18:
 ; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: kmovw %edi, %k1
-; AVX512DQ-NEXT: kmovw %esi, %k2
-; AVX512DQ-NEXT: kshiftrw $8, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $9, %k2, %k2
-; AVX512DQ-NEXT: kshiftrb $6, %k1, %k3
-; AVX512DQ-NEXT: kxorb %k2, %k3, %k2
-; AVX512DQ-NEXT: kshiftlb $7, %k2, %k2
-; AVX512DQ-NEXT: kshiftrb $1, %k2, %k2
+; AVX512DQ-NEXT: kmovw %edi, %k0
+; AVX512DQ-NEXT: kmovw %esi, %k1
+; AVX512DQ-NEXT: kshiftrw $8, %k1, %k2
+; AVX512DQ-NEXT: kmovw %k2, %eax
+; AVX512DQ-NEXT: kshiftrw $9, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %ecx
+; AVX512DQ-NEXT: kshiftrb $6, %k0, %k1
+; AVX512DQ-NEXT: kmovw %ecx, %k2
 ; AVX512DQ-NEXT: kxorb %k2, %k1, %k1
-; AVX512DQ-NEXT: kshiftlb $1, %k1, %k1
+; AVX512DQ-NEXT: kshiftlb $7, %k1, %k1
 ; AVX512DQ-NEXT: kshiftrb $1, %k1, %k1
-; AVX512DQ-NEXT: kshiftlb $7, %k0, %k0
-; AVX512DQ-NEXT: korb %k0, %k1, %k0
+; AVX512DQ-NEXT: kxorb %k1, %k0, %k0
+; AVX512DQ-NEXT: kshiftlb $1, %k0, %k0
+; AVX512DQ-NEXT: kshiftrb $1, %k0, %k0
+; AVX512DQ-NEXT: kmovw %eax, %k1
+; AVX512DQ-NEXT: kshiftlb $7, %k1, %k1
+; AVX512DQ-NEXT: korb %k1, %k0, %k0
 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
 ; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
Index: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -1617,7 +1617,8 @@
 define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qb_128:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT: vpmovqb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x32,0xc2]
 ; X86-NEXT: vpmovqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x32,0xc1]
 ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
@@ -1670,7 +1671,8 @@
 define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qb_128:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT: vpmovsqb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x22,0xc2]
 ; X86-NEXT: vpmovsqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x22,0xc1]
 ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
@@ -1723,7 +1725,8 @@
 define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qb_128:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT: vpmovusqb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x12,0xc2]
 ; X86-NEXT: vpmovusqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x12,0xc1]
 ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
@@ -1776,7 +1779,8 @@
 define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qb_256:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT: vpmovqb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x32,0xc2]
 ; X86-NEXT: vpmovqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x32,0xc1]
 ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
@@ -1833,7 +1837,8 @@
 define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qb_256:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT: vpmovsqb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x22,0xc2]
 ; X86-NEXT: vpmovsqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x22,0xc1]
 ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
@@ -1890,7 +1895,8 @@
 define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qb_256:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT: vpmovusqb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x12,0xc2]
 ; X86-NEXT: vpmovusqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x12,0xc1]
 ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
@@ -2619,7 +2625,8 @@
 define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_db_128:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT: vpmovdb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x31,0xc2]
 ; X86-NEXT: vpmovdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x31,0xc1]
 ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
@@ -2672,7 +2679,8 @@
 define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_db_128:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT: vpmovsdb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x21,0xc2]
 ; X86-NEXT: vpmovsdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x21,0xc1]
 ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
@@ -2725,7 +2733,8 @@
 define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_db_128:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT: vpmovusdb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x11,0xc2]
 ; X86-NEXT: vpmovusdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x11,0xc1]
 ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
@@ -2778,7 +2787,8 @@
 define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_db_256:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT: vpmovdb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x31,0xc2]
 ; X86-NEXT: vpmovdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x31,0xc1]
 ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
@@ -2835,7 +2845,8 @@
 define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_db_256:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT: vpmovsdb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x21,0xc2]
 ; X86-NEXT: vpmovsdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x21,0xc1]
 ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
@@ -2892,7 +2903,8 @@
 define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_db_256:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT: vpmovusdb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x11,0xc2]
 ; X86-NEXT: vpmovusdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x11,0xc1]
 ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
Index: llvm/trunk/test/CodeGen/X86/vec_floor.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vec_floor.ll
+++ llvm/trunk/test/CodeGen/X86/vec_floor.ll
@@ -1694,7 +1694,8 @@
 ; AVX512F: ## %bb.0:
 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %k1
 ; AVX512F-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm2 {%k1}
 ; AVX512F-NEXT: vmovaps %xmm2, %xmm0
 ; AVX512F-NEXT: vzeroupper
@@ -1702,7 +1703,8 @@
 ;
 ; AVX512VL-LABEL: floor_mask_ss_mask8:
 ; AVX512VL: ## %bb.0:
-; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; AVX512VL-NEXT: kmovw %k0, %k1
 ; AVX512VL-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm2 {%k1}
 ; AVX512VL-NEXT: vmovaps %xmm2, %xmm0
 ; AVX512VL-NEXT: retq
@@ -1753,14 +1755,16 @@
 ; AVX512F: ## %bb.0:
 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %k1
 ; AVX512F-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: floor_maskz_ss_mask8:
 ; AVX512VL: ## %bb.0:
-; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; AVX512VL-NEXT: kmovw %k0, %k1
 ; AVX512VL-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
 ; AVX512VL-NEXT: retq
   %mask1 = fcmp oeq <4 x float> %x, %y
@@ -1804,7 +1808,8 @@
 ; AVX512F: ## %bb.0:
 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %k1
 ; AVX512F-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm2 {%k1}
 ; AVX512F-NEXT: vmovapd %xmm2, %xmm0
 ; AVX512F-NEXT: vzeroupper
@@ -1812,7 +1817,8 @@
 ;
 ; AVX512VL-LABEL: floor_mask_sd_mask8:
 ; AVX512VL: ## %bb.0:
-; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
+; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; AVX512VL-NEXT: kmovw %k0, %k1
 ; AVX512VL-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm2 {%k1}
 ; AVX512VL-NEXT: vmovapd %xmm2, %xmm0
 ; AVX512VL-NEXT: retq
@@ -1863,14 +1869,16 @@
 ; AVX512F: ## %bb.0:
 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %k1
 ; AVX512F-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: floor_maskz_sd_mask8:
 ; AVX512VL: ## %bb.0:
-; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
+; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; AVX512VL-NEXT: kmovw %k0, %k1
 ; AVX512VL-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
 ; AVX512VL-NEXT: retq
   %mask1 = fcmp oeq <2 x double> %x, %y
@@ -2758,7 +2766,8 @@
 ; AVX512F: ## %bb.0:
 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %k1
 ; AVX512F-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm2 {%k1}
 ; AVX512F-NEXT: vmovaps %xmm2, %xmm0
 ; AVX512F-NEXT: vzeroupper
@@ -2766,7 +2775,8 @@
 ;
 ; AVX512VL-LABEL: ceil_mask_ss_mask8:
 ; AVX512VL: ## %bb.0:
-; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; AVX512VL-NEXT: kmovw %k0, %k1
 ; AVX512VL-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm2 {%k1}
 ; AVX512VL-NEXT: vmovaps %xmm2, %xmm0
 ; AVX512VL-NEXT: retq
@@ -2817,14 +2827,16 @@
 ; AVX512F: ## %bb.0:
 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %k1
 ; AVX512F-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm0 {%k1} {z}
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: ceil_maskz_ss_mask8:
 ; AVX512VL: ## %bb.0:
-; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; AVX512VL-NEXT: kmovw %k0, %k1
 ; AVX512VL-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm0 {%k1} {z}
 ; AVX512VL-NEXT: retq
   %mask1 = fcmp oeq <4 x float> %x, %y
@@ -2868,7 +2880,8 @@
 ; AVX512F: ## %bb.0:
 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %k1
 ; AVX512F-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm2 {%k1}
 ; AVX512F-NEXT: vmovapd %xmm2, %xmm0
 ; AVX512F-NEXT: vzeroupper
@@ -2876,7 +2889,8 @@
 ;
 ; AVX512VL-LABEL: ceil_mask_sd_mask8:
 ; AVX512VL: ## %bb.0:
-; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
+; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; AVX512VL-NEXT: kmovw %k0, %k1
 ; AVX512VL-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm2 {%k1}
 ; AVX512VL-NEXT: vmovapd %xmm2, %xmm0
 ; AVX512VL-NEXT: retq
@@ -2927,14 +2941,16 @@
 ; AVX512F: ## %bb.0:
 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %k1
 ; AVX512F-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm0 {%k1} {z}
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: ceil_maskz_sd_mask8:
 ; AVX512VL: ## %bb.0:
-; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
+; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; AVX512VL-NEXT: kmovw %k0, %k1
 ; AVX512VL-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm0 {%k1} {z}
 ; AVX512VL-NEXT: retq
   %mask1 = fcmp oeq <2 x double> %x, %y