Index: lib/Target/X86/X86FastISel.cpp
===================================================================
--- lib/Target/X86/X86FastISel.cpp
+++ lib/Target/X86/X86FastISel.cpp
@@ -540,11 +540,12 @@
       // In case ValReg is a K register, COPY to a GPR
       if (MRI.getRegClass(ValReg) == &X86::VK1RegClass) {
         unsigned KValReg = ValReg;
-        ValReg = createResultReg(Subtarget->is64Bit() ? &X86::GR8RegClass
-                                                      : &X86::GR8_ABCD_LRegClass);
+        ValReg = createResultReg(&X86::GR32RegClass);
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                 TII.get(TargetOpcode::COPY), ValReg)
             .addReg(KValReg);
+        ValReg = fastEmitInst_extractsubreg(MVT::i8, ValReg, /*Kill=*/true,
+                                            X86::sub_8bit);
       }
       // Mask out all but lowest bit.
       unsigned AndResult = createResultReg(&X86::GR8RegClass);
@@ -1280,11 +1281,12 @@
       // In case SrcReg is a K register, COPY to a GPR
       if (MRI.getRegClass(SrcReg) == &X86::VK1RegClass) {
         unsigned KSrcReg = SrcReg;
-        SrcReg = createResultReg(Subtarget->is64Bit() ? &X86::GR8RegClass
-                                                      : &X86::GR8_ABCD_LRegClass);
+        SrcReg = createResultReg(&X86::GR32RegClass);
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                 TII.get(TargetOpcode::COPY), SrcReg)
             .addReg(KSrcReg);
+        SrcReg = fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
+                                            X86::sub_8bit);
       }
       SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
       SrcVT = MVT::i8;
@@ -1580,11 +1582,12 @@
     // In case ResultReg is a K register, COPY to a GPR
     if (MRI.getRegClass(ResultReg) == &X86::VK1RegClass) {
       unsigned KResultReg = ResultReg;
-      ResultReg = createResultReg(Subtarget->is64Bit() ? &X86::GR8RegClass
-                                                       : &X86::GR8_ABCD_LRegClass);
+      ResultReg = createResultReg(&X86::GR32RegClass);
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(TargetOpcode::COPY), ResultReg)
           .addReg(KResultReg);
+      ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
+                                             X86::sub_8bit);
     }

     // Set the high bits to zero.
@@ -1768,11 +1771,12 @@
     // In case OpReg is a K register, COPY to a GPR
     if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) {
       unsigned KOpReg = OpReg;
-      OpReg = createResultReg(Subtarget->is64Bit() ? &X86::GR8RegClass
-                                                   : &X86::GR8_ABCD_LRegClass);
+      OpReg = createResultReg(&X86::GR32RegClass);
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(TargetOpcode::COPY), OpReg)
          .addReg(KOpReg);
+      OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, /*Kill=*/true,
+                                         X86::sub_8bit);
     }
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
         .addReg(OpReg)
@@ -2113,11 +2117,12 @@
       // In case OpReg is a K register, COPY to a GPR
       if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
         unsigned KCondReg = CondReg;
-        CondReg = createResultReg(Subtarget->is64Bit() ?
-                                  &X86::GR8RegClass : &X86::GR8_ABCD_LRegClass);
+        CondReg = createResultReg(&X86::GR32RegClass);
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                 TII.get(TargetOpcode::COPY), CondReg)
             .addReg(KCondReg, getKillRegState(CondIsKill));
+        CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Kill=*/true,
+                                             X86::sub_8bit);
       }
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
           .addReg(CondReg, getKillRegState(CondIsKill))
@@ -2327,11 +2332,12 @@
       // In case OpReg is a K register, COPY to a GPR
       if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
         unsigned KCondReg = CondReg;
-        CondReg = createResultReg(Subtarget->is64Bit() ?
-                                  &X86::GR8RegClass : &X86::GR8_ABCD_LRegClass);
+        CondReg = createResultReg(&X86::GR32RegClass);
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                 TII.get(TargetOpcode::COPY), CondReg)
             .addReg(KCondReg, getKillRegState(CondIsKill));
+        CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Kill=*/true,
+                                             X86::sub_8bit);
       }
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
           .addReg(CondReg, getKillRegState(CondIsKill))
@@ -3307,6 +3313,16 @@

         // Handle zero-extension from i1 to i8, which is common.
         if (ArgVT == MVT::i1) {
+          // In case SrcReg is a K register, COPY to a GPR
+          if (MRI.getRegClass(ArgReg) == &X86::VK1RegClass) {
+            unsigned KArgReg = ArgReg;
+            ArgReg = createResultReg(&X86::GR32RegClass);
+            BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                    TII.get(TargetOpcode::COPY), ArgReg)
+                .addReg(KArgReg);
+            ArgReg = fastEmitInst_extractsubreg(MVT::i8, ArgReg, /*Kill=*/true,
+                                                X86::sub_8bit);
+          }
           // Set the high bits to zero.
           ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /*TODO: Kill=*/false);
           ArgVT = MVT::i8;
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -2183,28 +2183,26 @@
 // GR from/to mask register
 def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
-          (COPY_TO_REGCLASS GR16:$src, VK16)>;
+          (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit)), VK16)>;
 def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
-          (COPY_TO_REGCLASS VK16:$src, GR16)>;
+          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_16bit)>;
 def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
-          (COPY_TO_REGCLASS GR8:$src, VK8)>;
+          (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$src, sub_8bit)), VK8)>;
 def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
-          (COPY_TO_REGCLASS VK8:$src, GR8)>;
+          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)), sub_8bit)>;

 def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))),
           (KMOVWrk VK16:$src)>;
 def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))),
-          (i32 (INSERT_SUBREG (IMPLICIT_DEF),
-                (i16 (COPY_TO_REGCLASS VK16:$src, GR16)), sub_16bit))>;
+          (COPY_TO_REGCLASS VK16:$src, GR32)>;

 def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
-          (MOVZX32rr8 (COPY_TO_REGCLASS VK8:$src, GR8))>, Requires<[NoDQI]>;
+          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)), sub_8bit))>, Requires<[NoDQI]>;
 def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
           (KMOVBrk VK8:$src)>, Requires<[HasDQI]>;
 def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
-          (i32 (INSERT_SUBREG (IMPLICIT_DEF),
-                (i8 (COPY_TO_REGCLASS VK8:$src, GR8)), sub_8bit))>;
+          (COPY_TO_REGCLASS VK8:$src, GR32)>;

 def : Pat<(v32i1 (bitconvert (i32 GR32:$src))),
           (COPY_TO_REGCLASS GR32:$src, VK32)>;
@@ -2385,8 +2383,8 @@
 let Predicates = [HasAVX512] in
   def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
               (i16 GR16:$src)),
-            (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
-              (v16i1 (COPY_TO_REGCLASS GR16:$src, VK16))), GR16)>;
+            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
+              (v16i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit)), VK16))), GR32)), sub_16bit)>;
 }

 defm : avx512_mask_unop_int<"knot", "KNOT">;
@@ -2442,9 +2440,9 @@
 let Predicates = [HasAVX512] in
   def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
               (i16 GR16:$src1), (i16 GR16:$src2)),
-            (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
-            (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)),
-            (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>;
+            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
+            (v16i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src1, sub_16bit)), VK16)),
+            (v16i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src2, sub_16bit)), VK16))), GR32)), sub_16bit)>;
 }

 defm : avx512_mask_binop_int<"kand", "KAND">;
@@ -3313,6 +3311,23 @@

 }

+multiclass avx512_store_scalar_lowering_subreg<string InstrStr, AVX512VLVectorVTInfo _, dag Mask, RegisterClass MaskRC, SubRegIndex subreg> {
+
+def : Pat<(masked_store addr:$dst, Mask,
+           (_.info512.VT (insert_subvector undef,
+                          (_.info256.VT (insert_subvector undef,
+                                         (_.info128.VT _.info128.RC:$src),
+                                         (iPTR 0))),
+                          (iPTR 0)))),
+          (!cast<Instruction>(InstrStr#mrk) addr:$dst,
+          (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)),
+          (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+
+}
+
 multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _, dag Mask, RegisterClass MaskRC> {
@@ -3339,22 +3354,50 @@

 }

+multiclass avx512_load_scalar_lowering_subreg<string InstrStr, AVX512VLVectorVTInfo _, dag Mask, RegisterClass MaskRC, SubRegIndex subreg> {
+
+def : Pat<(_.info128.VT (extract_subvector
+                         (_.info512.VT (masked_load addr:$srcAddr, Mask,
+                                        (_.info512.VT (bitconvert
+                                                       (v16i32 immAllZerosV))))),
+                         (iPTR 0))),
+          (!cast<Instruction>(InstrStr#rmkz)
+          (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)),
+          addr:$srcAddr)>;
+
+def : Pat<(_.info128.VT (extract_subvector
+                (_.info512.VT (masked_load addr:$srcAddr, Mask,
+                      (_.info512.VT (insert_subvector undef,
+                            (_.info256.VT (insert_subvector undef,
+                                  (_.info128.VT (X86vzmovl _.info128.RC:$src)),
+                                  (iPTR 0))),
+                            (iPTR 0))))),
+                (iPTR 0))),
+          (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
+          (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)),
+          addr:$srcAddr)>;
+
+}
+
 defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>;
 defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>;

 defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
                    (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
-defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
-                   (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16>;
-defm : avx512_store_scalar_lowering<"VMOVSDZ", avx512vl_f64_info,
-                   (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8>;
+defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
+                   (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16, sub_16bit>;
+defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
+                   (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;

 defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
                    (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
-defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
-                   (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16>;
-defm : avx512_load_scalar_lowering<"VMOVSDZ", avx512vl_f64_info,
-                   (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8>;
+defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
+                   (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16, sub_16bit>;
+defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
+                   (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;

 def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
           (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
@@ -3365,7 +3408,7 @@
                       VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>;

 def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
-          (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)),
+          (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM)),
           (COPY_TO_REGCLASS VR128X:$src, FR32X))>;

 let hasSideEffects = 0 in
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -6309,8 +6306,6 @@

   // SrcReg(MaskReg) -> DestReg(GR64)
   // SrcReg(MaskReg) -> DestReg(GR32)
-  // SrcReg(MaskReg) -> DestReg(GR16)
-  // SrcReg(MaskReg) -> DestReg(GR8)

   // All KMASK RegClasses hold the same k registers, can be tested against anyone.
   if (X86::VK16RegClass.contains(SrcReg)) {
@@ -6320,21 +6318,14 @@
     }
     if (X86::GR32RegClass.contains(DestReg))
       return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk;
-    if (X86::GR16RegClass.contains(DestReg)) {
-      DestReg = getX86SubSuperRegister(DestReg, 32);
-      return X86::KMOVWrk;
-    }
-    if (X86::GR8RegClass.contains(DestReg)) {
-      assert(!isHReg(DestReg) && "Cannot move between mask and h-reg");
-      DestReg = getX86SubSuperRegister(DestReg, 32);
-      return Subtarget.hasDQI() ? X86::KMOVBrk : X86::KMOVWrk;
-    }
+    if (X86::GR16RegClass.contains(DestReg))
+      llvm_unreachable("Bad copy");
+    if (X86::GR8RegClass.contains(DestReg))
+      llvm_unreachable("Bad copy");
   }

   // SrcReg(GR64) -> DestReg(MaskReg)
   // SrcReg(GR32) -> DestReg(MaskReg)
-  // SrcReg(GR16) -> DestReg(MaskReg)
-  // SrcReg(GR8) -> DestReg(MaskReg)

   // All KMASK RegClasses hold the same k registers, can be tested against anyone.
   if (X86::VK16RegClass.contains(DestReg)) {
@@ -6344,15 +6335,10 @@
     }
     if (X86::GR32RegClass.contains(SrcReg))
       return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr;
-    if (X86::GR16RegClass.contains(SrcReg)) {
-      SrcReg = getX86SubSuperRegister(SrcReg, 32);
-      return X86::KMOVWkr;
-    }
-    if (X86::GR8RegClass.contains(SrcReg)) {
-      assert(!isHReg(SrcReg) && "Cannot move between mask and h-reg");
-      SrcReg = getX86SubSuperRegister(SrcReg, 32);
-      return Subtarget.hasDQI() ?
X86::KMOVBkr : X86::KMOVWkr; - } + if (X86::GR16RegClass.contains(SrcReg)) + llvm_unreachable("Bad copy"); + if (X86::GR8RegClass.contains(SrcReg)) + llvm_unreachable("Bad copy"); } Index: test/CodeGen/X86/vector-shuffle-128-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v16.ll +++ test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -422,7 +422,7 @@ ; AVX512VL-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: movw $-21846, %ax # imm = 0xAAAA -; AVX512VL-NEXT: kmovw %eax, %k1 +; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> @@ -462,7 +462,7 @@ ; AVX512VL-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: movw $-30584, %ax # imm = 0x8888 -; AVX512VL-NEXT: kmovw %eax, %k1 +; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> @@ -521,7 +521,7 @@ ; AVX512VL-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: movw $-28528, %ax # imm = 0x9090 -; AVX512VL-NEXT: kmovw %eax, %k1 +; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> @@ -562,7 +562,7 @@ ; AVX512VL-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: movw $-21264, %ax # imm = 0xACF0 -; AVX512VL-NEXT: kmovw %eax, %k1 +; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1} ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> Index: test/CodeGen/X86/vector-shuffle-256-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v16.ll +++ test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -718,7 +718,7 @@ ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: movw $-32768, %ax # imm = 0x8000 -; AVX512VL-NEXT: kmovw %eax, %k1 +; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -743,7 +743,7 @@ ; AVX512VL-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: movw $1, %ax -; AVX512VL-NEXT: kmovw %eax, %k1 +; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -768,7 +768,7 @@ ; AVX512VL-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: movw $21930, %ax # imm = 0x55AA -; AVX512VL-NEXT: kmovw %eax, %k1 +; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -793,7 +793,7 @@ ; AVX512VL-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: movw $-21931, %ax # imm = 0xAA55 -; AVX512VL-NEXT: kmovw %eax, %k1 +; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> 
%b, <16 x i32> Index: test/CodeGen/X86/vector-shuffle-256-v32.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v32.ll +++ test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -383,7 +383,7 @@ ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512VL-NEXT: movw $1, %ax -; AVX512VL-NEXT: kmovw %eax, %k1 +; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: retq @@ -414,7 +414,7 @@ ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512VL-NEXT: movw $1, %ax -; AVX512VL-NEXT: kmovw %eax, %k1 +; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: retq Index: test/CodeGen/X86/vector-shuffle-512-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v16.ll +++ test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -262,12 +262,19 @@ } define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a, <16 x i32> %b) { -; ALL-LABEL: shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u: -; ALL: # BB#0: -; ALL-NEXT: movw $8, %ax -; ALL-NEXT: kmovw %eax, %k1 -; ALL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; ALL-NEXT: retq +; AVX512F-LABEL: shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u: +; AVX512F: # BB#0: +; AVX512F-NEXT: movw $8, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: movw $8, %ax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: retq %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> ret <16 x i32> %c } @@ -398,12 +405,19 @@ } define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01(<16 x i32> %a, <16 x i32> %passthru, i16 %mask) { -; ALL-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01: -; ALL: # BB#0: -; ALL-NEXT: kmovw %edi, %k1 -; ALL-NEXT: valignd {{.*#+}} zmm1 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1] -; ALL-NEXT: vmovdqa64 %zmm1, %zmm0 -; ALL-NEXT: retq +; AVX512F-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01: +; AVX512F: # BB#0: +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: valignd {{.*#+}} zmm1 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: valignd {{.*#+}} zmm1 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> %mask.cast = bitcast i16 %mask to <16 x i1> %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru @@ -411,12 +425,19 @@ } define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) { -; ALL-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17: -; ALL: # BB#0: 
-; ALL-NEXT: kmovw %edi, %k1 -; ALL-NEXT: valignd {{.*#+}} zmm2 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1] -; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 -; ALL-NEXT: retq +; AVX512F-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17: +; AVX512F: # BB#0: +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: valignd {{.*#+}} zmm2 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: valignd {{.*#+}} zmm2 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> %mask.cast = bitcast i16 %mask to <16 x i1> %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru @@ -424,11 +445,17 @@ } define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01(<16 x i32> %a, i16 %mask) { -; ALL-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01: -; ALL: # BB#0: -; ALL-NEXT: kmovw %edi, %k1 -; ALL-NEXT: valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1] -; ALL-NEXT: retq +; AVX512F-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01: +; AVX512F: # BB#0: +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1] +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1] +; AVX512BW-NEXT: retq %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> %mask.cast = bitcast i16 %mask to <16 x i1> %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> zeroinitializer @@ -436,11 +463,17 @@ } define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17(<16 x i32> %a, <16 x i32> %b, i16 %mask) { -; ALL-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17: -; ALL: # BB#0: -; ALL-NEXT: kmovw %edi, %k1 -; ALL-NEXT: valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1] -; ALL-NEXT: retq +; AVX512F-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17: +; AVX512F: # BB#0: +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1] +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1] +; AVX512BW-NEXT: retq %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> %mask.cast = bitcast i16 %mask to <16 x i1> %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> zeroinitializer @@ -510,12 +543,19 @@ } define <16 x float> @mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) { -; ALL-LABEL: mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23: -; ALL: # BB#0: -; ALL-NEXT: kmovw %edi, %k1 -; ALL-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, 
%zmm2 {%k1} -; ALL-NEXT: vmovaps %zmm2, %zmm0 -; ALL-NEXT: retq +; AVX512F-LABEL: mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23: +; AVX512F: # BB#0: +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 +; AVX512BW-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> %mask.cast = bitcast i16 %mask to <16 x i1> %res = select <16 x i1> %mask.cast, <16 x float> %shuffle, <16 x float> %passthru @@ -523,12 +563,19 @@ } define <16 x float> @mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) { -; ALL-LABEL: mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15: -; ALL: # BB#0: -; ALL-NEXT: kmovw %edi, %k1 -; ALL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} -; ALL-NEXT: vmovaps %zmm2, %zmm0 -; ALL-NEXT: retq +; AVX512F-LABEL: mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15: +; AVX512F: # BB#0: +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 +; AVX512BW-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> %mask.cast = bitcast i16 %mask to <16 x i1> %res = select <16 x i1> %mask.cast, <16 x float> %shuffle, <16 x float> %passthru @@ -536,12 +583,19 @@ } define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) { -; ALL-LABEL: mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23: -; ALL: # BB#0: -; ALL-NEXT: kmovw %edi, %k1 -; ALL-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} -; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 -; ALL-NEXT: retq +; AVX512F-LABEL: mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23: +; AVX512F: # BB#0: +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> %mask.cast = bitcast i16 %mask to <16 x i1> %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru @@ -549,12 +603,19 @@ } define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) { -; ALL-LABEL: mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15: -; ALL: # BB#0: -; ALL-NEXT: kmovw %edi, %k1 -; ALL-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} -; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 -; ALL-NEXT: retq +; AVX512F-LABEL: 
mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15: +; AVX512F: # BB#0: +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> %mask.cast = bitcast i16 %mask to <16 x i1> %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru Index: test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -51,7 +51,7 @@ ; ; X64-LABEL: combine_permvar_8f64_identity_mask: ; X64: # BB#0: -; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] ; X64-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1} ; X64-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] @@ -89,7 +89,7 @@ ; ; X64-LABEL: combine_permvar_8i64_identity_mask: ; X64: # BB#0: -; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] ; X64-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1} ; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] @@ -126,7 +126,7 @@ ; ; X64-LABEL: combine_vpermt2var_8f64_identity_mask: ; X64: # BB#0: -; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] ; X64-NEXT: vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z} ; X64-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] @@ -175,7 +175,7 @@ ; ; X64-LABEL: combine_vpermt2var_8f64_movddup_mask: ; X64: # BB#0: -; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] ; X64-NEXT: retq %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> , <8 x double> %x0, <8 x double> %x1, i8 %m) @@ -207,7 +207,7 @@ ; ; X64-LABEL: combine_vpermt2var_8i64_identity_mask: ; X64: # BB#0: -; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] ; X64-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 {%k1} {z} ; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] @@ -242,7 +242,7 @@ ; ; X64-LABEL: combine_vpermt2var_16f32_identity_mask: ; X64: # BB#0: -; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vmovaps {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 {%k1} {z} ; X64-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] @@ -299,7 +299,7 @@ ; ; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask: ; X64: # BB#0: -; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] ; X64-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z} ; X64-NEXT: retq @@ -319,7 +319,7 @@ ; ; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load: ; X64: # BB#0: -; X64-NEXT: kmovw %esi, %k1 +; X64-NEXT: kmovd %esi, %k1 ; X64-NEXT: vmovaps (%rdi), %zmm2 ; X64-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] ; X64-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1 {%k1} {z} @@ -367,7 +367,7 @@ ; ; X64-LABEL: 
combine_vpermt2var_16f32_vmovshdup_mask: ; X64: # BB#0: -; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; X64-NEXT: retq %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 %m) @@ -411,7 +411,7 @@ ; ; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_mask: ; X64: # BB#0: -; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; X64-NEXT: retq %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 %m) @@ -427,7 +427,7 @@ ; ; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load: ; X64: # BB#0: -; X64-NEXT: kmovw %esi, %k1 +; X64-NEXT: kmovd %esi, %k1 ; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; X64-NEXT: retq %x0 = load <16 x float>, <16 x float> *%p0 @@ -472,7 +472,7 @@ ; ; X64-LABEL: combine_vpermt2var_16f32_vpermilps_mask: ; X64: # BB#0: -; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; X64-NEXT: retq %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 %m) @@ -488,7 +488,7 @@ ; ; X64-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load: ; X64: # BB#0: -; X64-NEXT: kmovw %esi, %k1 +; X64-NEXT: kmovd %esi, %k1 ; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; X64-NEXT: retq %x0 = load <16 x float>, <16 x float> *%p0 @@ -520,7 +520,7 @@ ; ; X64-LABEL: combine_vpermt2var_16i32_identity_mask: ; X64: # BB#0: -; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vmovdqa32 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 {%k1} {z} ; X64-NEXT: vmovdqa32 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] @@ -675,7 +675,7 @@ ; ; X64-LABEL: combine_permvar_8i64_as_permq_mask: ; X64: # BB#0: -; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4] ; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ; X64-NEXT: retq @@ -707,7 +707,7 @@ ; ; X64-LABEL: combine_permvar_8f64_as_permpd_mask: ; X64: # BB#0: -; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4] ; X64-NEXT: vmovapd %zmm1, %zmm0 ; X64-NEXT: retq Index: test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll +++ test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll @@ -30,7 +30,7 @@ ; ; X64-LABEL: combine_vpermt2var_16i16_identity_mask: ; X64: # BB#0: -; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vmovdqu {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 {%k1} {z} ; X64-NEXT: vmovdqu {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] Index: test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll +++ test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll @@ -45,7 +45,7 @@ ; ; X64-LABEL: combine_vpermt2var_16i8_identity_mask: ; X64: # BB#0: -; X64-NEXT: 
kmovw %edi, %k1 +; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vmovdqu {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-NEXT: vpermi2b %xmm1, %xmm0, %xmm2 {%k1} {z} ; X64-NEXT: vmovdqu {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] Index: test/CodeGen/X86/vector-shuffle-masked.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-masked.ll +++ test/CodeGen/X86/vector-shuffle-masked.ll @@ -4,7 +4,7 @@ define <4 x i32> @mask_shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passthru, i8 %mask) { ; CHECK-LABEL: mask_shuffle_v4i32_1234: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: valignd {{.*#+}} xmm2 {%k1} = xmm0[1,2,3],xmm1[0] ; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ; CHECK-NEXT: retq @@ -18,7 +18,7 @@ define <4 x i32> @maskz_shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b, i8 %mask) { ; CHECK-LABEL: maskz_shuffle_v4i32_1234: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: valignd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2,3],xmm1[0] ; CHECK-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> @@ -31,7 +31,7 @@ define <4 x i32> @mask_shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passthru, i8 %mask) { ; CHECK-LABEL: mask_shuffle_v4i32_2345: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: valignd {{.*#+}} xmm2 {%k1} = xmm0[2,3],xmm1[0,1] ; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ; CHECK-NEXT: retq @@ -45,7 +45,7 @@ define <4 x i32> @maskz_shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b, i8 %mask) { ; CHECK-LABEL: maskz_shuffle_v4i32_2345: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: valignd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3],xmm1[0,1] ; CHECK-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> @@ -58,7 +58,7 @@ define <2 x i64> @mask_shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: mask_shuffle_v2i64_12: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: valignq {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[0] ; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ; CHECK-NEXT: retq @@ -72,7 +72,7 @@ define <2 x i64> @maskz_shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b, i8 %mask) { ; CHECK-LABEL: maskz_shuffle_v2i64_12: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: valignq {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[0] ; CHECK-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> @@ -85,7 +85,7 @@ define <4 x i64> @mask_shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: mask_shuffle_v4i64_1234: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: valignq {{.*#+}} ymm2 {%k1} = ymm0[1,2,3],ymm1[0] ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -99,7 +99,7 @@ define <4 x i64> @maskz_shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b, i8 %mask) { ; CHECK-LABEL: maskz_shuffle_v4i64_1234: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: valignq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,3],ymm1[0] ; CHECK-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -112,7 +112,7 @@ define <4 x i64> @mask_shuffle_v4i64_1230(<4 x i64> %a, <4 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: mask_shuffle_v4i64_1230: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, 
%k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,3,0] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -126,7 +126,7 @@ define <4 x i64> @maskz_shuffle_v4i64_1230(<4 x i64> %a, i8 %mask) { ; CHECK-LABEL: maskz_shuffle_v4i64_1230: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,3,0] ; CHECK-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> @@ -139,7 +139,7 @@ define <8 x i32> @mask_shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passthru, i8 %mask) { ; CHECK-LABEL: mask_shuffle_v8i32_12345678: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: valignd {{.*#+}} ymm2 {%k1} = ymm0[1,2,3,4,5,6,7],ymm1[0] ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -152,7 +152,7 @@ define <8 x i32> @maskz_shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b, i8 %mask) { ; CHECK-LABEL: maskz_shuffle_v8i32_12345678: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: valignd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,3,4,5,6,7],ymm1[0] ; CHECK-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -164,7 +164,7 @@ define <8 x i32> @mask_shuffle_v8i32_23456789(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passthru, i8 %mask) { ; CHECK-LABEL: mask_shuffle_v8i32_23456789: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: valignd {{.*#+}} ymm2 {%k1} = ymm0[2,3,4,5,6,7],ymm1[0,1] ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -177,7 +177,7 @@ define <8 x i32> @maskz_shuffle_v8i32_23456789(<8 x i32> %a, <8 x i32> %b, i8 %mask) { ; CHECK-LABEL: maskz_shuffle_v8i32_23456789: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: valignd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,4,5,6,7],ymm1[0,1] ; CHECK-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -189,7 +189,7 @@ define <8 x i32> @mask_shuffle_v8i32_12345670(<8 x i32> %a, <8 x i32> %passthru, i8 %mask) { ; CHECK-LABEL: mask_shuffle_v8i32_12345670: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: valignd {{.*#+}} ymm1 {%k1} = ymm0[1,2,3,4,5,6,7,0] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -202,7 +202,7 @@ define <8 x i32> @maskz_shuffle_v8i32_12345670(<8 x i32> %a, i8 %mask) { ; CHECK-LABEL: maskz_shuffle_v8i32_12345670: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: valignd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,3,4,5,6,7,0] ; CHECK-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> @@ -215,7 +215,7 @@ ; CHECK-LABEL: mask_shuffle_v8i32_23456701: ; CHECK: # BB#0: ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0] -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> @@ -228,7 +228,7 @@ ; CHECK-LABEL: maskz_shuffle_v8i32_23456701: ; CHECK: # BB#0: ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0] -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> @@ -240,7 +240,7 @@ define <4 x i32> @mask_extract_v16i32_v4i32_0(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) { ; CHECK-LABEL: 
mask_extract_v16i32_v4i32_0: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextracti32x4 $0, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -255,7 +255,7 @@ define <4 x i32> @mask_extract_v16i32_v4i32_1(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v16i32_v4i32_1: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -270,7 +270,7 @@ define <4 x i32> @mask_extract_v16i32_v4i32_2(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v16i32_v4i32_2: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -285,7 +285,7 @@ define <4 x i32> @mask_extract_v16i32_v4i32_3(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v16i32_v4i32_3: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -300,7 +300,7 @@ define <4 x float> @mask_extract_v16f32_v4f32_0(<16 x float> %a, <4 x float> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v16f32_v4f32_0: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf32x4 $0, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -315,7 +315,7 @@ define <4 x float> @mask_extract_v16f32_v4f32_1(<16 x float> %a, <4 x float> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v16f32_v4f32_1: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -330,7 +330,7 @@ define <4 x float> @mask_extract_v16f32_v4f32_2(<16 x float> %a, <4 x float> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v16f32_v4f32_2: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -345,7 +345,7 @@ define <4 x float> @mask_extract_v16f32_v4f32_3(<16 x float> %a, <4 x float> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v16f32_v4f32_3: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -360,7 +360,7 @@ define <8 x i32> @mask_extract_v16i32_v8i32_0(<16 x i32> %a, <8 x i32> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v16i32_v8i32_0: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextracti32x8 $0, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -373,7 +373,7 @@ define <8 x i32> @mask_extract_v16i32_v8i32_1(<16 x i32> %a, <8 x i32> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v16i32_v8i32_1: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextracti32x8 $1, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -386,7 +386,7 @@ define <8 x float> @mask_extract_v16f32_v8f32_0(<16 x float> %a, <8 x float> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v16f32_v8f32_0: ; CHECK: # BB#0: -; 
CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf32x8 $0, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -399,7 +399,7 @@ define <8 x float> @mask_extract_v16f32_v8f32_1(<16 x float> %a, <8 x float> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v16f32_v8f32_1: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -412,7 +412,7 @@ define <2 x i64> @mask_extract_v8i64_v2i64_0(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v8i64_v2i64_0: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextracti64x2 $0, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -427,7 +427,7 @@ define <2 x i64> @mask_extract_v8i64_v2i64_1(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v8i64_v2i64_1: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextracti64x2 $1, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -442,7 +442,7 @@ define <2 x i64> @mask_extract_v8i64_v2i64_2(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v8i64_v2i64_2: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextracti64x2 $2, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -457,7 +457,7 @@ define <2 x i64> @mask_extract_v8i64_v2i64_3(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v8i64_v2i64_3: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextracti64x2 $3, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -472,7 +472,7 @@ define <2 x double> @mask_extract_v8f64_v2f64_0(<8 x double> %a, <2 x double> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v8f64_v2f64_0: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf64x2 $0, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -487,7 +487,7 @@ define <2 x double> @mask_extract_v8f64_v2f64_1(<8 x double> %a, <2 x double> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v8f64_v2f64_1: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -502,7 +502,7 @@ define <2 x double> @mask_extract_v8f64_v2f64_2(<8 x double> %a, <2 x double> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v8f64_v2f64_2: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf64x2 $2, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -517,7 +517,7 @@ define <2 x double> @mask_extract_v8f64_v2f64_3(<8 x double> %a, <2 x double> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v8f64_v2f64_3: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf64x2 $3, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -532,7 +532,7 @@ define <4 x i64> @mask_extract_v8i64_v4i64_0(<8 x i64> %a, <4 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v8i64_v4i64_0: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; 
CHECK-NEXT: vextracti64x4 $0, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -546,7 +546,7 @@ define <4 x i64> @mask_extract_v8i64_v4i64_1(<8 x i64> %a, <4 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v8i64_v4i64_1: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -560,7 +560,7 @@ define <4 x double> @mask_extract_v8f64_v4f64_0(<8 x double> %a, <4 x double> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v8f64_v4f64_0: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf64x4 $0, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -574,7 +574,7 @@ define <4 x double> @mask_extract_v8f64_v4f64_1(<8 x double> %a, <4 x double> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v8f64_v4f64_1: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -588,7 +588,7 @@ define <8 x i32> @mask_extract_v8i64_v8i32_1(<8 x i64> %a, <8 x i32> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v8i64_v8i32_1: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextracti32x8 $1, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -602,7 +602,7 @@ define <8 x float> @mask_extract_v8f64_v8f32_1(<8 x double> %a, <8 x float> %passthru, i8 %mask) { ; CHECK-LABEL: mask_extract_v8f64_v8f32_1: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -616,7 +616,7 @@ define <4 x i32> @mask_cast_extract_v8i64_v4i32_1(<8 x i64> %a, <4 x i32> %passthru, i8 %mask) { ; CHECK-LABEL: mask_cast_extract_v8i64_v4i32_1: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -632,7 +632,7 @@ define <4 x float> @mask_cast_extract_v8f64_v4f32_1(<8 x double> %a, <4 x float> %passthru, i8 %mask) { ; CHECK-LABEL: mask_cast_extract_v8f64_v4f32_1: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -648,7 +648,7 @@ define <4 x i64> @mask_cast_extract_v16i32_v4i64_1(<16 x i32> %a, <4 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: mask_cast_extract_v16i32_v4i64_1: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -663,7 +663,7 @@ define <4 x double> @mask_cast_extract_v16f32_v4f64_1(<16 x float> %a, <4 x double> %passthru, i8 %mask) { ; CHECK-LABEL: mask_cast_extract_v16f32_v4f64_1: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -678,7 +678,7 @@ define <2 x i64> @mask_cast_extract_v16i32_v2i64_1(<16 x i32> %a, <2 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: mask_cast_extract_v16i32_v2i64_1: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextracti64x2 $1, %zmm0, %xmm1 {%k1} ; 
CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -694,7 +694,7 @@ define <2 x double> @mask_cast_extract_v16f32_v2f64_1(<16 x float> %a, <2 x double> %passthru, i8 %mask) { ; CHECK-LABEL: mask_cast_extract_v16f32_v2f64_1: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -710,7 +710,7 @@ define <2 x double> @broadcast_v4f32_0101_from_v2f32_mask(double* %x, <2 x double> %passthru, i8 %mask) { ; CHECK-LABEL: broadcast_v4f32_0101_from_v2f32_mask: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = mem[0,0] ; CHECK-NEXT: retq %q = load double, double* %x, align 1 @@ -725,7 +725,7 @@ define <2 x double> @broadcast_v4f32_0101_from_v2f32_maskz(double* %x, i8 %mask) { ; CHECK-LABEL: broadcast_v4f32_0101_from_v2f32_maskz: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = mem[0,0] ; CHECK-NEXT: retq %q = load double, double* %x, align 1 @@ -740,7 +740,7 @@ define <8 x float> @test_broadcast_2f64_8f32(<2 x double> *%p, i8 %mask) nounwind { ; CHECK-LABEL: test_broadcast_2f64_8f32: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %1 = load <2 x double>, <2 x double> *%p @@ -754,7 +754,7 @@ define <8 x i32> @test_broadcast_2i64_8i32(<2 x i64> *%p, i8 %mask) nounwind { ; CHECK-LABEL: test_broadcast_2i64_8i32: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %1 = load <2 x i64>, <2 x i64> *%p @@ -824,7 +824,7 @@ define <4 x double> @test_broadcast_4f32_4f64(<4 x float> *%p, i8 %mask) nounwind { ; CHECK-LABEL: test_broadcast_4f32_4f64: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1] ; CHECK-NEXT: retq %1 = load <4 x float>, <4 x float> *%p @@ -839,7 +839,7 @@ define <4 x i64> @test_broadcast_4i32_4i64(<4 x i32> *%p, i8 %mask) nounwind { ; CHECK-LABEL: test_broadcast_4i32_4i64: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1] ; CHECK-NEXT: retq %1 = load <4 x i32>, <4 x i32> *%p @@ -854,7 +854,7 @@ define <8 x double> @test_broadcast_4f32_8f64(<4 x float> *%p, i8 %mask) nounwind { ; CHECK-LABEL: test_broadcast_4f32_8f64: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %1 = load <4 x float>, <4 x float> *%p @@ -868,7 +868,7 @@ define <8 x i64> @test_broadcast_4i32_8i64(<4 x i32> *%p, i8 %mask) nounwind { ; CHECK-LABEL: test_broadcast_4i32_8i64: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %1 = load <4 x i32>, <4 x i32> *%p @@ -882,7 +882,7 @@ define <8 x double> @test_broadcast_8f32_8f64(<8 x float> *%p, i8 %mask) nounwind { ; CHECK-LABEL: test_broadcast_8f32_8f64: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 {%k1} {z} = 
mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %1 = load <8 x float>, <8 x float> *%p @@ -896,7 +896,7 @@ define <8 x i64> @test_broadcast_8i32_8i64(<8 x i32> *%p, i8 %mask) nounwind { ; CHECK-LABEL: test_broadcast_8i32_8i64: ; CHECK: # BB#0: -; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %1 = load <8 x i32>, <8 x i32> *%p Index: test/CodeGen/X86/vector-shuffle-v1.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-v1.ll +++ test/CodeGen/X86/vector-shuffle-v1.ll @@ -35,11 +35,11 @@ ; VL_BW_DQ: # BB#0: ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0 -; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 ; VL_BW_DQ-NEXT: movb $1, %al -; VL_BW_DQ-NEXT: kmovb %eax, %k0 +; VL_BW_DQ-NEXT: kmovd %eax, %k1 +; VL_BW_DQ-NEXT: vpmovm2q %k1, %xmm0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm1 -; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 ; VL_BW_DQ-NEXT: retq @@ -171,7 +171,7 @@ ; ; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u: ; VL_BW_DQ: # BB#0: -; VL_BW_DQ-NEXT: kmovb %edi, %k0 +; VL_BW_DQ-NEXT: kmovd %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 ; VL_BW_DQ-NEXT: vextracti64x2 $1, %zmm0, %xmm0 ; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %zmm0 @@ -195,18 +195,20 @@ ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AL %AL %EAX ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u: ; VL_BW_DQ: # BB#0: -; VL_BW_DQ-NEXT: kmovb %edi, %k0 +; VL_BW_DQ-NEXT: kmovd %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 ; VL_BW_DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u> ; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0 -; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: kmovd %k0, %eax +; VL_BW_DQ-NEXT: # kill: %AL %AL %EAX ; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> @@ -224,16 +226,18 @@ ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AL %AL %EAX ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u: ; VL_BW_DQ: # BB#0: -; VL_BW_DQ-NEXT: kmovb %edi, %k0 +; VL_BW_DQ-NEXT: kmovd %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 ; VL_BW_DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1] ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 -; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: kmovd %k0, %eax +; VL_BW_DQ-NEXT: # kill: %AL %AL %EAX ; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> @@ -253,18 +257,20 @@ ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AL %AL %EAX ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0: ; VL_BW_DQ: # BB#0: -; VL_BW_DQ-NEXT: kmovb %edi, %k0 +; VL_BW_DQ-NEXT: kmovd %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 ; VL_BW_DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0] ; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; VL_BW_DQ-NEXT: vpmovq2m 
%zmm2, %k0 -; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: kmovd %k0, %eax +; VL_BW_DQ-NEXT: # kill: %AL %AL %EAX ; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> @@ -284,18 +290,20 @@ ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AL %AL %EAX ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0: ; VL_BW_DQ: # BB#0: -; VL_BW_DQ-NEXT: kmovb %edi, %k0 +; VL_BW_DQ-NEXT: kmovd %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7] ; VL_BW_DQ-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0 -; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: kmovd %k0, %eax +; VL_BW_DQ-NEXT: # kill: %AL %AL %EAX ; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> @@ -317,20 +325,22 @@ ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AL %AL %EAX ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1: ; VL_BW_DQ: # BB#0: -; VL_BW_DQ-NEXT: kmovb %edi, %k0 +; VL_BW_DQ-NEXT: kmovd %edi, %k0 ; VL_BW_DQ-NEXT: movb $51, %al -; VL_BW_DQ-NEXT: kmovb %eax, %k1 +; VL_BW_DQ-NEXT: kmovd %eax, %k1 ; VL_BW_DQ-NEXT: vpmovm2q %k1, %zmm0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm1 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1] ; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0 -; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: kmovd %k0, %eax +; VL_BW_DQ-NEXT: # kill: %AL %AL %EAX ; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> @@ -352,6 +362,7 @@ ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AL %AL %EAX ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -364,7 +375,8 @@ ; VL_BW_DQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 ; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0 -; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: kmovd %k0, %eax +; VL_BW_DQ-NEXT: # kill: %AL %AL %EAX ; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq %c = shufflevector <8 x i1> , <8 x i1> %a, <8 x i32> @@ -382,16 +394,18 @@ ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AX %AX %EAX ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: ; VL_BW_DQ: # BB#0: -; VL_BW_DQ-NEXT: kmovw %edi, %k0 +; VL_BW_DQ-NEXT: kmovd %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm0 ; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0 ; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0 -; VL_BW_DQ-NEXT: kmovw %k0, %eax +; VL_BW_DQ-NEXT: kmovd %k0, %eax +; VL_BW_DQ-NEXT: # kill: %AX %AX %EAX ; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq %b = bitcast i16 %a to <16 x i1>