Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -2944,18 +2944,62 @@
 // Vector load with broadcast
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_avx512_mask_pbroadcast_b_gpr_128 :
+        GCCBuiltin<"__builtin_ia32_pbroadcastb128_gpr_mask">,
+        Intrinsic<[llvm_v16i8_ty],
+                  [llvm_i8_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_pbroadcast_b_gpr_256 :
+        GCCBuiltin<"__builtin_ia32_pbroadcastb256_gpr_mask">,
+        Intrinsic<[llvm_v32i8_ty],
+                  [llvm_i8_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_pbroadcast_b_gpr_512 :
+        GCCBuiltin<"__builtin_ia32_pbroadcastb512_gpr_mask">,
+        Intrinsic<[llvm_v64i8_ty],
+                  [llvm_i8_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_pbroadcast_w_gpr_128 :
+        GCCBuiltin<"__builtin_ia32_pbroadcastw128_gpr_mask">,
+        Intrinsic<[llvm_v8i16_ty],
+                  [llvm_i16_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_pbroadcast_w_gpr_256 :
+        GCCBuiltin<"__builtin_ia32_pbroadcastw256_gpr_mask">,
+        Intrinsic<[llvm_v16i16_ty],
+                  [llvm_i16_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_pbroadcast_w_gpr_512 :
+        GCCBuiltin<"__builtin_ia32_pbroadcastw512_gpr_mask">,
+        Intrinsic<[llvm_v32i16_ty],
+                  [llvm_i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_pbroadcast_d_gpr_128 :
+        GCCBuiltin<"__builtin_ia32_pbroadcastd128_gpr_mask">,
+        Intrinsic<[llvm_v4i32_ty],
+                  [llvm_i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_pbroadcast_d_gpr_256 :
+        GCCBuiltin<"__builtin_ia32_pbroadcastd256_gpr_mask">,
+        Intrinsic<[llvm_v8i32_ty],
+                  [llvm_i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_pbroadcast_d_gpr_512 :
-        GCCBuiltin<"__builtin_ia32_pbroadcastd512_gpr_mask">,
-        Intrinsic<[llvm_v16i32_ty], [llvm_i32_ty, llvm_v16i32_ty,
-        llvm_i16_ty], [IntrNoMem]>;
+        GCCBuiltin<"__builtin_ia32_pbroadcastd512_gpr_mask">,
+        Intrinsic<[llvm_v16i32_ty],
+                  [llvm_i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_pbroadcast_q_gpr_128 :
+        GCCBuiltin<"__builtin_ia32_pbroadcastq128_gpr_mask">,
+        Intrinsic<[llvm_v2i64_ty],
+                  [llvm_i64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_pbroadcast_q_gpr_256 :
+        GCCBuiltin<"__builtin_ia32_pbroadcastq256_gpr_mask">,
+        Intrinsic<[llvm_v4i64_ty],
+                  [llvm_i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_pbroadcast_q_gpr_512 :
-        GCCBuiltin<"__builtin_ia32_pbroadcastq512_gpr_mask">,
-        Intrinsic<[llvm_v8i64_ty], [llvm_i64_ty, llvm_v8i64_ty,
-        llvm_i8_ty], [IntrNoMem]>;
+        GCCBuiltin<"__builtin_ia32_pbroadcastq512_gpr_mask">,
+        Intrinsic<[llvm_v8i64_ty],
+                  [llvm_i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+
   def int_x86_avx512_mask_pbroadcast_q_mem_512 :
-        GCCBuiltin<"__builtin_ia32_pbroadcastq512_mem_mask">,
-        Intrinsic<[llvm_v8i64_ty], [llvm_i64_ty, llvm_v8i64_ty,
-        llvm_i8_ty], [IntrNoMem]>;
+        GCCBuiltin<"__builtin_ia32_pbroadcastq512_mem_mask">,
+        Intrinsic<[llvm_v8i64_ty],
+                  [llvm_i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
 }

 // Vector permutation
@@ -6082,11 +6126,6 @@
           Intrinsic<[llvm_v8i64_ty], [llvm_v4i64_ty, llvm_v8i64_ty,
           llvm_i8_ty], [IntrNoMem]>;

-  def int_x86_avx512_pbroadcastd_i32_512 :
-          Intrinsic<[llvm_v16i32_ty], [llvm_i32_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_pbroadcastq_i64_512 :
-          Intrinsic<[llvm_v8i64_ty],
-          [llvm_i64_ty], [IntrNoMem]>;
   def int_x86_avx512_broadcastmw_512 :
           GCCBuiltin<"__builtin_ia32_broadcastmw512">,
           Intrinsic<[llvm_v16i32_ty], [llvm_i16_ty], [IntrNoMem]>;
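Note on the signatures above: every variant takes the scalar to splat, a pass-through vector, and a mask with one bit per destination element (so the 512-bit byte form needs an i64 mask). A minimal IR sketch of a call to the 128-bit byte variant; the function name is illustrative, the declaration is the one introduced above:

    declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16)

    define <16 x i8> @splat_b_128(i8 %x, <16 x i8> %passthru, i16 %m) {
      ; lanes whose mask bit is set receive %x; the rest keep %passthru
      %r = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x, <16 x i8> %passthru, i16 %m)
      ret <16 x i8> %r
    }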
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -913,9 +913,10 @@
 multiclass avx512_int_broadcast_reg<bits<8> opc, X86VectorVTInfo _,
                                     RegisterClass SrcRC> {
-  defm r : AVX512_maskable_in_asm<opc, MRMSrcReg, _, (outs _.RC:$dst),
-                         (ins SrcRC:$src), "vpbroadcast"##_.Suffix,
-                         "$src", "$src", []>, T8PD, EVEX;
+  defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                         (ins SrcRC:$src), "vpbroadcast"##_.Suffix,
+                         "$src", "$src",
+                         (_.VT (X86VBroadcast SrcRC:$src))>, T8PD, EVEX;
 }

 multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _,
@@ -932,6 +933,12 @@
                                                  HasBWI>;
 defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, GR32,
                                                  HasBWI>;
+let isCodeGenOnly = 1 in {
+  defm VPBROADCASTBr_Alt : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info,
+                                                       GR8, HasBWI>;
+  defm VPBROADCASTWr_Alt : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info,
+                                                       GR16, HasBWI>;
+}
 defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, GR32,
                                                  HasAVX512>;
 defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, GR64,
@@ -939,27 +946,9 @@
 def : Pat <(v16i32 (X86vzext VK16WM:$mask)),
            (VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
-
 def : Pat <(v8i64 (X86vzext VK8WM:$mask)),
            (VPBROADCASTQrZrkz VK8WM:$mask, (i64 (MOV64ri 0x1)))>;

-def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))),
-          (VPBROADCASTDrZr GR32:$src)>;
-def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))),
-          (VPBROADCASTQrZr GR64:$src)>;
-
-def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))),
-          (VPBROADCASTDrZr GR32:$src)>;
-def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_i64_512 (i64 GR64:$src))),
-          (VPBROADCASTQrZr GR64:$src)>;
-
-def : Pat<(v16i32 (int_x86_avx512_mask_pbroadcast_d_gpr_512 (i32 GR32:$src),
-           (v16i32 immAllZerosV), (i16 GR16:$mask))),
-          (VPBROADCASTDrZrkz (COPY_TO_REGCLASS GR16:$mask, VK16WM), GR32:$src)>;
-def : Pat<(v8i64 (int_x86_avx512_mask_pbroadcast_q_gpr_512 (i64 GR64:$src),
-           (bc_v8i64 (v16i32 immAllZerosV)), (i8 GR8:$mask))),
-          (VPBROADCASTQrZrkz (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>;
-
 // Provide aliases for broadcast from the same register class that
 // automatically does the extract.
 multiclass avx512_int_broadcast_rm_lowering<X86VectorVTInfo DestInfo,
                                             X86VectorVTInfo SrcInfo> {
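The switch from AVX512_maskable_in_asm to AVX512_maskable attaches a real (_.VT (X86VBroadcast SrcRC:$src)) pattern to the register form, so ordinary splat IR now selects the GPR-source instructions directly, and the hand-written per-intrinsic patterns removed above become unnecessary. A sketch of IR that should hit the new pattern on an AVX512BW target, mirroring the tests added below:

    define <64 x i8> @splat64xi8(i8 %a) {
      ; expected selection: vpbroadcastb %dil, %zmm0
      %b = insertelement <64 x i8> undef, i8 %a, i32 0
      %c = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
      ret <64 x i8> %c
    }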
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
-  // Provide fallback in case the load node that is used in the patterns above
-  // is used by additional users, which prevents the pattern selection.
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
 let AddedComplexity = 20 in {
-  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
-            (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
-            (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
-            (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
-  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
-            (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
-  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
-            (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
-  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
-            (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
-
-  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
-            (VPBROADCASTBrr (COPY_TO_REGCLASS
-             (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
-             VR128))>;
-  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
-            (VPBROADCASTBYrr (COPY_TO_REGCLASS
-             (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
-             VR128))>;
-
-  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
-            (VPBROADCASTWrr (COPY_TO_REGCLASS
-             (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
-             VR128))>;
-  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
-            (VPBROADCASTWYrr (COPY_TO_REGCLASS
-             (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
-             VR128))>;
-
-  // The patterns for VPBROADCASTD are not needed because they would match
-  // the exact same thing as VBROADCASTSS patterns.
-
-  def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
-            (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
-  // The v4i64 pattern is not needed because VBROADCASTSDYrr already match.
+  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+            (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+            (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+            (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
   }
 }

+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI], AddedComplexity = 20 in {
+  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
+            (VPBROADCASTBrr (COPY_TO_REGCLASS
+             (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+             VR128))>;
+  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
+            (VPBROADCASTBYrr (COPY_TO_REGCLASS
+             (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+             VR128))>;
+
+  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
+            (VPBROADCASTWrr (COPY_TO_REGCLASS
+             (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
+             VR128))>;
+  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
+            (VPBROADCASTWYrr (COPY_TO_REGCLASS
+             (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
+             VR128))>;
+}
+let Predicates = [HasAVX2, NoVLX], AddedComplexity = 20 in {
+  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+            (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+            (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+            (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+
+  // The patterns for VPBROADCASTD are not needed because they would match
+  // the exact same thing as VBROADCASTSS patterns.
+
+  def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
+            (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+  // The v4i64 pattern is not needed because VBROADCASTSDYrr already matches.
+}
+
 // AVX1 broadcast patterns
 let Predicates = [HasAVX1Only] in {
   def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
@@ -8467,10 +8472,9 @@
     (VBROADCASTSSrm addr:$src)>;
 }

-let Predicates = [HasAVX] in {
 // Provide fallback in case the load node that is used in the patterns above
 // is used by additional users, which prevents the pattern selection.
-  let AddedComplexity = 20 in {
+let Predicates = [HasAVX], AddedComplexity = 20 in {
   // 128bit broadcasts:
   def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
             (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
@@ -8483,6 +8487,11 @@
              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
             (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;
+  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
+            (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+}
+
+let Predicates = [HasAVX, NoVLX], AddedComplexity = 20 in {
   def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
             (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
   def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
@@ -8493,12 +8502,9 @@
             (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
             (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
-  }
-
-  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
-            (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
   def : Pat<(v2i64 (X86VBroadcast i64:$src)),
-            (VMOVDDUPrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+            (VMOVDDUPrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
 }

 //===----------------------------------------------------------------------===//
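The "fallback" comments in this file guard against the case where the broadcast source is a load with more than one user: the load-folding broadcast patterns then cannot fire, and these register-source patterns pick up the splat instead. A reduced IR sketch of that situation (names illustrative):

    define <8 x i32> @bcast_load_extra_use(i32* %p, i32* %out) {
      %v = load i32, i32* %p
      store i32 %v, i32* %out       ; second user keeps the load from folding
      %b = insertelement <8 x i32> undef, i32 %v, i32 0
      %s = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
      ret <8 x i32> %s
    }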
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -1036,6 +1036,30 @@
   X86_INTRINSIC_DATA(avx512_mask_pavg_w_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
   X86_INTRINSIC_DATA(avx512_mask_pavg_w_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
   X86_INTRINSIC_DATA(avx512_mask_pavg_w_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
   X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128, CMP_MASK, X86ISD::PCMPEQM, 0),
   X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256, CMP_MASK, X86ISD::PCMPEQM, 0),
   X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512, CMP_MASK, X86ISD::PCMPEQM, 0),
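Routing the intrinsics through INTR_TYPE_1OP_MASK lets the generic intrinsic lowering build an X86ISD::VBROADCAST node and blend it with the pass-through operand under the mask, so no per-intrinsic ISel patterns are needed. Semantically, the masked call behaves like the following hand-written IR (a sketch for the 512-bit dword form, not compiler output):

    define <16 x i32> @masked_splat_equiv(i32 %x, <16 x i32> %passthru, i16 %m) {
      %ins   = insertelement <16 x i32> undef, i32 %x, i32 0
      %splat = shufflevector <16 x i32> %ins, <16 x i32> undef, <16 x i32> zeroinitializer
      %bits  = bitcast i16 %m to <16 x i1>
      %r     = select <16 x i1> %bits, <16 x i32> %splat, <16 x i32> %passthru
      ret <16 x i32> %r
    }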
Index: test/CodeGen/X86/avx-isa-check.ll
===================================================================
--- test/CodeGen/X86/avx-isa-check.ll
+++ test/CodeGen/X86/avx-isa-check.ll
@@ -1,5 +1,6 @@
 ; check AVX2 instructions that are disabled in case avx512VL/avx512BW present
-
+
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=corei7-avx -o /dev/null
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=core-avx2 -mattr=+avx2 -o /dev/null
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=knl -o /dev/null
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=knl -mattr=+avx512vl -o /dev/null
@@ -575,3 +576,76 @@
   %C = zext <8 x i8> %B to <8 x i16>
   ret <8 x i16> %C
 }
+
+define <32 x i8> @_broadcast32xi8(i8 %a) {
+  %b = insertelement <32 x i8> undef, i8 %a, i32 0
+  %c = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
+  ret <32 x i8> %c
+}
+
+define <16 x i8> @_broadcast16xi8(i8 %a) {
+  %b = insertelement <16 x i8> undef, i8 %a, i32 0
+  %c = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
+  ret <16 x i8> %c
+}
+
+define <16 x i16> @_broadcast16xi16(i16 %a) {
+  %b = insertelement <16 x i16> undef, i16 %a, i32 0
+  %c = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
+  ret <16 x i16> %c
+}
+
+define <8 x i16> @_broadcast8xi16(i16 %a) {
+  %b = insertelement <8 x i16> undef, i16 %a, i32 0
+  %c = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
+  ret <8 x i16> %c
+}
+
+define <8 x i32> @_broadcast8xi32(i32 %a) {
+  %b = insertelement <8 x i32> undef, i32 %a, i32 0
+  %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
+  ret <8 x i32> %c
+}
+
+define <4 x i32> @_broadcast4xi32(i32 %a) {
+  %b = insertelement <4 x i32> undef, i32 %a, i32 0
+  %c = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %c
+}
+
+define <4 x i64> @_broadcast4xi64(i64 %a) {
+  %b = insertelement <4 x i64> undef, i64 %a, i64 0
+  %c = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
+  ret <4 x i64> %c
+}
+
+define <2 x i64> @_broadcast2xi64(i64 %a) {
+  %b = insertelement <2 x i64> undef, i64 %a, i64 0
+  %c = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %c
+}
+
+define <8 x float> @_broadcast8xfloat(float %a) {
+  %b = insertelement <8 x float> undef, float %a, i32 0
+  %c = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer
+  ret <8 x float> %c
+}
+
+define <4 x float> @_broadcast4xfloat(float %a) {
+  %b = insertelement <4 x float> undef, float %a, i32 0
+  %c = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %c
+}
+
+define <4 x double> @_broadcast4xdouble(double %a) {
+  %b = insertelement <4 x double> undef, double %a, i32 0
+  %c = shufflevector <4 x double> %b, <4 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %c
+}
+
+define <2 x double> @_broadcast2xdouble(double %a) {
+  %b = insertelement <2 x double> undef, double %a, i32 0
+  %c = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
+  ret <2 x double> %c
+}
+
Index: test/CodeGen/X86/avx512-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics.ll
+++ test/CodeGen/X86/avx512-intrinsics.ll
@@ -574,16 +574,6 @@
 }
 declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16)

-define <16 x i32> @test_x86_pbroadcastd_i32_512(i32 %a0) {
-; CHECK-LABEL: test_x86_pbroadcastd_i32_512:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpbroadcastd %edi, %zmm0
-; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.i32.512(i32 %a0) ; <<16 x i32>> [#uses=1]
-  ret <16 x i32> %res
-}
-declare <16 x i32> @llvm.x86.avx512.pbroadcastd.i32.512(i32) nounwind readonly
-
 define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512:
 ; CHECK:       ## BB#0:
@@ -603,16 +593,6 @@
 }
 declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8)

-define <8 x i64> @test_x86_pbroadcastq_i64_512(i64 %a0) {
-; CHECK-LABEL: test_x86_pbroadcastq_i64_512:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0
-; CHECK-NEXT:    retq
-  %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64 %a0) ; <<8 x i64>> [#uses=1]
-  ret <8 x i64> %res
-}
-declare <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64) nounwind readonly
-
 define <16 x i32> @test_conflict_d(<16 x i32> %a) {
 ; CHECK-LABEL: test_conflict_d:
 ; CHECK:       ## BB#0:
@@ -7356,3 +7336,42 @@
   %res2 = add i8 %res, %res1
   ret i8 %res2
 }
+
+define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpbroadcastd %edi, %zmm0 {%k1}
+; CHECK-NEXT:    vpbroadcastd %edi, %zmm1 {%k1} {z}
+; CHECK-NEXT:    vpbroadcastd %edi, %zmm2
+; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1)
+  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask)
+  %res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask)
+  %res3 = add <16 x i32> %res, %res1
+  %res4 = add <16 x i32> %res2, %res3
+  ret <16 x i32> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16)
+
+define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1}
+; CHECK-NEXT:    vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; CHECK-NEXT:    vpbroadcastq %rdi, %zmm2
+; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1)
+  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask)
+  %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask)
+  %res3 = add <8 x i64> %res, %res1
+  %res4 = add <8 x i64> %res2, %res3
+  ret <8 x i64> %res4
+}
+declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8)
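The same three-call shape recurs in every test that follows (unmasked via an all-ones mask, merge-masked, zero-masked), and the mask width always equals the element count, as these declarations collected from the tests show:

    declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16)
    declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
    declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8)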
Index: test/CodeGen/X86/avx512bw-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512bw-intrinsics.ll
+++ test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -3516,3 +3516,67 @@
   %res2 = add i32 %res, %res1
   ret i32 %res2
 }
+
+declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rsi, %k1
+; AVX512BW-NEXT:    vpbroadcastb %dil, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpbroadcastb %dil, %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    vpbroadcastb %dil, %zmm2
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
+; AVX512F-32-NEXT:    vpbroadcastb %al, %zmm1 {%k1} {z}
+; AVX512F-32-NEXT:    vpbroadcastb %al, %zmm0 {%k1}
+; AVX512F-32-NEXT:    vpbroadcastb %al, %zmm2
+; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
+; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1)
+  %res1 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 %mask)
+  %res2 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> zeroinitializer, i64 %mask)
+  %res3 = add <64 x i8> %res, %res1
+  %res4 = add <64 x i8> %res2, %res3
+  ret <64 x i8> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpbroadcastw %di, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpbroadcastw %di, %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    vpbroadcastw %di, %zmm2
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movw {{[0-9]+}}(%esp), %ax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpbroadcastw %ax, %zmm0 {%k1}
+; AVX512F-32-NEXT:    vpbroadcastw %ax, %zmm1 {%k1} {z}
+; AVX512F-32-NEXT:    vpbroadcastw %ax, %zmm2
+; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
+; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1)
+  %res1 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask)
+  %res2 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask)
+  %res3 = add <32 x i16> %res, %res1
+  %res4 = add <32 x i16> %res2, %res3
+  ret <32 x i16> %res4
+}
Index: test/CodeGen/X86/avx512bwvl-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -5444,3 +5444,82 @@
   ret i16 %res2
 }

+declare <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpbroadcastb %dil, %ymm0 {%k1}
+; CHECK-NEXT:    vpbroadcastb %dil, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpbroadcastb %dil, %ymm2
+; CHECK-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
+; CHECK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
+  %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1)
+  %res1 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 %mask)
+  %res2 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> zeroinitializer, i32 %mask)
+  %res3 = add <32 x i8> %res, %res1
+  %res4 = add <32 x i8> %res2, %res3
+  ret <32 x i8> %res4
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpbroadcastb %dil, %xmm0 {%k1}
+; CHECK-NEXT:    vpbroadcastb %dil, %xmm1 {%k1} {z}
+; CHECK-NEXT:    vpbroadcastb %dil, %xmm2
+; CHECK-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 %mask)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> zeroinitializer, i16 %mask)
+  %res3 = add <16 x i8> %res, %res1
+  %res4 = add <16 x i8> %res2, %res3
+  ret <16 x i8> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpbroadcastw %di, %ymm0 {%k1}
+; CHECK-NEXT:    vpbroadcastw %di, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpbroadcastw %di, %ymm2
+; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
+; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
+  %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1)
+  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 %mask)
+  %res2 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> zeroinitializer, i16 %mask)
+  %res3 = add <16 x i16> %res, %res1
+  %res4 = add <16 x i16> %res2, %res3
+  ret <16 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpbroadcastw %di, %xmm0 {%k1}
+; CHECK-NEXT:    vpbroadcastw %di, %xmm1 {%k1} {z}
+; CHECK-NEXT:    vpbroadcastw %di, %xmm2
+; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 %mask)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> zeroinitializer, i8 %mask)
+  %res3 = add <8 x i16> %res, %res1
+  %res4 = add <8 x i16> %res2, %res3
+  ret <8 x i16> %res4
+}
Index: test/CodeGen/X86/avx512vl-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics.ll
+++ test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -8206,3 +8206,82 @@
   ret i8 %res2
 }

+declare <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpbroadcastd %edi, %ymm0 {%k1}
+; CHECK-NEXT:    vpbroadcastd %edi, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpbroadcastd %edi, %ymm2
+; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
+  %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 -1)
+  %res1 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 %mask)
+  %res2 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> zeroinitializer, i8 %mask)
+  %res3 = add <8 x i32> %res, %res1
+  %res4 = add <8 x i32> %res2, %res3
+  ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_128(i32 %x0, <4 x i32> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpbroadcastd %edi, %xmm0 {%k1}
+; CHECK-NEXT:    vpbroadcastd %edi, %xmm1 {%k1} {z}
+; CHECK-NEXT:    vpbroadcastd %edi, %xmm2
+; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 -1)
+  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 %mask)
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> zeroinitializer, i8 %mask)
+  %res3 = add <4 x i32> %res, %res1
+  %res4 = add <4 x i32> %res2, %res3
+  ret <4 x i32> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_256(i64 %x0, <4 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpbroadcastq %rdi, %ymm0 {%k1}
+; CHECK-NEXT:    vpbroadcastq %rdi, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpbroadcastq %rdi, %ymm2
+; CHECK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
+  %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 -1)
+  %res1 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 %mask)
+  %res2 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> zeroinitializer,i8 %mask)
+  %res3 = add <4 x i64> %res, %res1
+  %res4 = add <4 x i64> %res2, %res3
+  ret <4 x i64> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_128(i64 %x0, <2 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpbroadcastq %rdi, %xmm0 {%k1}
+; CHECK-NEXT:    vpbroadcastq %rdi, %xmm1 {%k1} {z}
+; CHECK-NEXT:    vpbroadcastq %rdi, %xmm2
+; CHECK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 -1)
+  %res1 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 %mask)
+  %res2 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> zeroinitializer,i8 %mask)
+  %res3 = add <2 x i64> %res, %res1
+  %res4 = add <2 x i64> %res2, %res3
+  ret <2 x i64> %res4
+}
Index: test/CodeGen/X86/masked_gather_scatter.ll
===================================================================
--- test/CodeGen/X86/masked_gather_scatter.ll
+++ test/CodeGen/X86/masked_gather_scatter.ll
@@ -637,8 +637,7 @@
 ; SKX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
 ; SKX-NEXT:    vinserti64x2 $0, %xmm1, %zmm0, %zmm0
 ; SKX-NEXT:    vpbroadcastq %xmm0, %zmm0
-; SKX-NEXT:    vmovd %esi, %xmm1
-; SKX-NEXT:    vpbroadcastd %xmm1, %ymm1
+; SKX-NEXT:    vpbroadcastd %esi, %ymm1
 ; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
 ; SKX-NEXT:    vpsllq $2, %zmm1, %zmm1
 ; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
Index: test/CodeGen/X86/vector-shuffle-v1.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-v1.ll
+++ test/CodeGen/X86/vector-shuffle-v1.ll
@@ -74,13 +74,13 @@
 ; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
-; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1} {z}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0]
-; AVX512F-NEXT:    vpermq %zmm1, %zmm2, %zmm1
-; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
-; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    movq {{.*}}(%rip), %rax
+; AVX512F-NEXT:    vpbroadcastq %rax, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
+; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vpbroadcastq %rax, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -105,14 +105,14 @@
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
 ; AVX512F-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
-; AVX512F-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0
-; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2} {z}
-; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1} {z}
-; AVX512F-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
-; AVX512F-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
-; AVX512F-NEXT:    vpslld $31, %zmm2, %zmm1
-; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    movl {{.*}}(%rip), %eax
+; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0 {%k2} {z}
+; AVX512F-NEXT:    vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512F-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; AVX512F-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
+; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -163,13 +163,13 @@
 ; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    kmovw %edi, %k1
-; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1} {z}
-; AVX512F-NEXT:    vextracti32x4 $1, %zmm1, %xmm1
-; AVX512F-NEXT:    vpbroadcastq %xmm1, %zmm1
-; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
-; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    movq {{.*}}(%rip), %rax
+; AVX512F-NEXT:    vpbroadcastq %rax, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vextracti32x4 $1, %zmm0, %xmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %zmm0
+; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vpbroadcastq %rax, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -310,12 +310,12 @@
 ; AVX512F-NEXT:    kmovw %edi, %k1
 ; AVX512F-NEXT:    movb $51, %al
 ; AVX512F-NEXT:    kmovw %eax, %k2
-; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2} {z}
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    movq {{.*}}(%rip), %rax
+; AVX512F-NEXT:    vpbroadcastq %rax, %zmm0 {%k2} {z}
+; AVX512F-NEXT:    vpbroadcastq %rax, %zmm1 {%k1} {z}
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    retq