// Patch summary (three files):
//  * X86ISelLowering.cpp: the node for IntrData->Opc0 is now created with
//    ResVT (a vector of ScalarVT, i.e. i64 or f64, with VT.getSizeInBits()/64
//    elements) and bitcast back to VT before being handed to
//    getVectorMaskingNode, instead of being created directly with type VT.
//  * X86InstrAVX512.td: avx512_broadcast_rm becomes avx512_broadcast_rm_split,
//    which takes an extra MaskInfo argument. Its patterns wrap the DestInfo.VT
//    X86VBroadcast in a bitconvert to MaskInfo.VT and take the mask register
//    class, pass-through register and all-zeros vector from MaskInfo.
//    avx512_broadcast_rm is re-added on top of the split multiclass, and the
//    32x2 broadcast defms (Z, Z256, Z128) switch to the split multiclass.
//  * vector-shuffle-masked.ll: adds twelve test_broadcast{f,i}32x2_* tests
//    that select a shuffle result against either a pass-through operand or
//    zeroinitializer, on 4-, 8- and 16-element vectors.
// NOTE(review): the masked-load "mk" pattern still builds its instruction name
// from DestInfo.ZSuffix while the "m" and "mkz" patterns use MaskInfo.ZSuffix;
// presumably equivalent when both infos have the same vector width -- confirm.
// NOTE(review): this copy looks lossy -- text that would sit between angle
// brackets (multiclass template lists, !cast types, defm operands,
// shufflevector masks) is absent and the original line breaks are gone.
// Confirm against the original patch before applying.
Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -19916,9 +19916,11 @@ MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64; MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64); Src = DAG.getBitcast(BitcastVT, Src); + MVT ResVT = MVT::getVectorVT(ScalarVT, VT.getSizeInBits()/64); + SDValue Res = DAG.getNode(IntrData->Opc0, dl, ResVT, Src); + Res = DAG.getBitcast(VT, Res); - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), - Mask, PassThru, Subtarget, DAG); + return getVectorMaskingNode(Res, Mask, PassThru, Subtarget, DAG); } default: break; Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -1028,39 +1028,60 @@ DestInfo.KRCWM:$mask, (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>; } -multiclass avx512_broadcast_rm opc, string OpcodeStr, - X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> { +// Split version to allow mask and broadcast node to be different types. This +// helps support the 32x2 broadcasts. 
+multiclass avx512_broadcast_rm_split opc, string OpcodeStr, + X86VectorVTInfo MaskInfo, + X86VectorVTInfo DestInfo, + X86VectorVTInfo SrcInfo> { let ExeDomain = DestInfo.ExeDomain in { - defm r : AVX512_maskable, + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))))>, T8PD, EVEX; - defm m : AVX512_maskable, + (MaskInfo.VT + (bitconvert + (DestInfo.VT (X86VBroadcast + (SrcInfo.ScalarLdFrag addr:$src)))))>, T8PD, EVEX, EVEX_CD8; } - def : Pat<(DestInfo.VT (X86VBroadcast - (SrcInfo.VT (scalar_to_vector - (SrcInfo.ScalarLdFrag addr:$src))))), - (!cast(NAME#DestInfo.ZSuffix#m) addr:$src)>; - def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask, - (X86VBroadcast - (SrcInfo.VT (scalar_to_vector - (SrcInfo.ScalarLdFrag addr:$src)))), - DestInfo.RC:$src0)), + def : Pat<(MaskInfo.VT + (bitconvert + (DestInfo.VT (X86VBroadcast + (SrcInfo.VT (scalar_to_vector + (SrcInfo.ScalarLdFrag addr:$src))))))), + (!cast(NAME#MaskInfo.ZSuffix#m) addr:$src)>; + def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask, + (bitconvert + (DestInfo.VT + (X86VBroadcast + (SrcInfo.VT (scalar_to_vector + (SrcInfo.ScalarLdFrag addr:$src)))))), + MaskInfo.RC:$src0)), (!cast(NAME#DestInfo.ZSuffix#mk) - DestInfo.RC:$src0, DestInfo.KRCWM:$mask, addr:$src)>; - def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask, - (X86VBroadcast - (SrcInfo.VT (scalar_to_vector - (SrcInfo.ScalarLdFrag addr:$src)))), - DestInfo.ImmAllZerosV)), - (!cast(NAME#DestInfo.ZSuffix#mkz) - DestInfo.KRCWM:$mask, addr:$src)>; -} + MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, addr:$src)>; + def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask, + (bitconvert + (DestInfo.VT + (X86VBroadcast + (SrcInfo.VT (scalar_to_vector + (SrcInfo.ScalarLdFrag addr:$src)))))), + MaskInfo.ImmAllZerosV)), + (!cast(NAME#MaskInfo.ZSuffix#mkz) + MaskInfo.KRCWM:$mask, addr:$src)>; +} + +// Helper multiclass to force the mask and broadcast result to the same type. 
+multiclass avx512_broadcast_rm opc, string OpcodeStr, + X86VectorVTInfo DestInfo, + X86VectorVTInfo SrcInfo> : + avx512_broadcast_rm_split; multiclass avx512_fp_broadcast_sd opc, string OpcodeStr, AVX512VLVectorVTInfo _> { @@ -1384,11 +1405,13 @@ multiclass avx512_common_broadcast_32x2 opc, string OpcodeStr, AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> { let Predicates = [HasDQI] in - defm Z : avx512_broadcast_rm, - EVEX_V512; + defm Z : avx512_broadcast_rm_split, + EVEX_V512; let Predicates = [HasDQI, HasVLX] in - defm Z256 : avx512_broadcast_rm, - EVEX_V256; + defm Z256 : avx512_broadcast_rm_split, + EVEX_V256; } multiclass avx512_common_broadcast_i32x2 opc, string OpcodeStr, @@ -1396,8 +1419,9 @@ avx512_common_broadcast_32x2 { let Predicates = [HasDQI, HasVLX] in - defm Z128 : avx512_broadcast_rm, - EVEX_V128; + defm Z128 : avx512_broadcast_rm_split, + EVEX_V128; } defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2", Index: test/CodeGen/X86/vector-shuffle-masked.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-masked.ll +++ test/CodeGen/X86/vector-shuffle-masked.ll @@ -1680,3 +1680,158 @@ %res = select <8 x i1> %mask.cast, <8 x i64> %3, <8 x i64> zeroinitializer ret <8 x i64> %res } + +define <4 x float> @test_broadcastf32x2_v4f32(<4 x float> %vec, <4 x float> %passthru, i8 %mask) { +; CHECK-LABEL: test_broadcastf32x2_v4f32: +; CHECK: # BB#0: +; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x float> %shuf, <4 x float> %passthru + ret <4 x float> %res +} + +define <4 x float> @test_broadcastf32x2_v4f32_z(<4 x float> %vec, i8 %mask) { +; 
CHECK-LABEL: test_broadcastf32x2_v4f32_z: +; CHECK: # BB#0: +; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x float> %shuf, <4 x float> zeroinitializer + ret <4 x float> %res +} + +define <4 x i32> @test_broadcasti32x2_v4i32(<4 x i32> %vec, <4 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: test_broadcasti32x2_v4i32: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuf, <4 x i32> %passthru + ret <4 x i32> %res +} + +define <4 x i32> @test_broadcasti32x2_v4i32_z(<4 x i32> %vec, i8 %mask) { +; CHECK-LABEL: test_broadcasti32x2_v4i32_z: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuf, <4 x i32> zeroinitializer + ret <4 x i32> %res +} + +define <8 x float> @test_broadcastf32x2_v8f32(<8 x float> %vec, <8 x float> %passthru, i8 %mask) { +; CHECK-LABEL: test_broadcastf32x2_v8f32: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuf = shufflevector <8 x 
float> %vec, <8 x float> undef, <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x float> %shuf, <8 x float> %passthru + ret <8 x float> %res +} + +define <8 x float> @test_broadcastf32x2_v8f32_z(<8 x float> %vec, i8 %mask) { +; CHECK-LABEL: test_broadcastf32x2_v8f32_z: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1] +; CHECK-NEXT: retq + %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x float> %shuf, <8 x float> zeroinitializer + ret <8 x float> %res +} + +define <8 x i32> @test_broadcasti32x2_v8i32(<8 x i32> %vec, <8 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: test_broadcasti32x2_v8i32: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i32> %shuf, <8 x i32> %passthru + ret <8 x i32> %res +} + +define <8 x i32> @test_broadcasti32x2_v8i32_z(<8 x i32> %vec, i8 %mask) { +; CHECK-LABEL: test_broadcasti32x2_v8i32_z: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1] +; CHECK-NEXT: retq + %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i32> %shuf, <8 x i32> zeroinitializer + ret <8 x i32> %res +} + +define <16 x float> @test_broadcastf32x2_v16f32_z(<16 x float> %vec, i16 %mask) { +; CHECK-LABEL: test_broadcastf32x2_v16f32_z: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; CHECK-NEXT: retq + 
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> + %mask.cast = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask.cast, <16 x float> %shuf, <16 x float> zeroinitializer + ret <16 x float> %res +} + +define <16 x i32> @test_broadcasti32x2_v16i32(<16 x i32> %vec, <16 x i32> %passthru, i16 %mask) { +; CHECK-LABEL: test_broadcasti32x2_v16i32: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %mask.cast = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask.cast, <16 x i32> %shuf, <16 x i32> %passthru + ret <16 x i32> %res +} + +define <16 x float> @test_broadcastf32x2_v16f32(<16 x float> %vec, <16 x float> %passthru, i16 %mask) { +; CHECK-LABEL: test_broadcastf32x2_v16f32: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; CHECK-NEXT: vmovapd %zmm1, %zmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> + %mask.cast = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask.cast, <16 x float> %shuf, <16 x float> %passthru + ret <16 x float> %res +} + +define <16 x i32> @test_broadcasti32x2_v16i32_z(<16 x i32> %vec, i16 %mask) { +; CHECK-LABEL: test_broadcasti32x2_v16i32_z: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; CHECK-NEXT: retq + %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %mask.cast = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask.cast, <16 x i32> %shuf, <16 x i32> zeroinitializer + ret <16 x i32> %res +}