This is an archive of the discontinued LLVM Phabricator instance.

[AVX512] Correct isel patterns to support selecting masked vbroadcastf32x2/vbroadcasti32x2
ClosedPublic

Authored by craig.topper on Aug 29 2017, 10:37 PM.

Download Raw Diff

Details

Reviewers

aymanmus
zvi
igorb

Commits

rG17854ecf2423: [AVX512] Correct isel patterns to support selecting masked…
rL312101: [AVX512] Correct isel patterns to support selecting masked…

Summary

This patch adjusts the patterns to make the result type of the broadcast node vXf64/vXi64. Then adds a bitcast to vXi32 after that. Intrinsic lowering was also adjusted to generate this new pattern.

Fixes PR34357

We should probably just drop the intrinsic entirely and use native IR, but I'll leave that for a future patch.

Any idea what instruction we should be lowering the floating point 128-bit result version of this pattern to? There's a 128-bit v2i32 integer broadcast but not an fp one.

Diff Detail

Repository: rL LLVM

Event Timeline

craig.topper created this revision. · Aug 29 2017, 10:37 PM

craig.topper added a child revision: D37287: [X86] Implement broadcastf32x2 and broadcasti32x2 intrinsics using __builtin_shufflevector instead builtins. · Aug 30 2017, 12:11 AM

craig.topper retitled this revision from [AVX512] Correct isel patterns to support selecting masked vbroadcastf32x2/vbroadi32x2 to [AVX512] Correct isel patterns to support selecting masked vbroadcastf32x2/vbroadcasti32x2.

LGTM

Regarding the 128-bit floating point version, the resulting sequence in the test looks fine.
It can be lowered to either vmovddup or vshufps, but both showed the same throughput results in IACA.

This revision is now accepted and ready to land. · Aug 30 2017, 12:33 AM

movddup won't allow the mask to fold. shufps would allow the masking to fold. The only annoying thing is that we can't fold the 64-bit load with shufps.

Closed by commit rL312101: [AVX512] Correct isel patterns to support selecting masked… (authored by ctopper). · Explain Why · Aug 30 2017, 12:50 AM

This revision was automatically updated to reflect the committed changes.

That's right, but in the merge-mask version it doesn't really improve anything: you need the mov instruction from xmm1 to xmm0 anyway, so folding the mask into the mov or into the shuffle/duplicate is equivalent.
Only in the zero-mask version can it save us the last masked mov (if we fold the mask), but IACA still showed no throughput improvement.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

X86/

X86ISelLowering.cpp

6 lines

X86InstrAVX512.td

84 lines

test/

CodeGen/

X86/

vector-shuffle-masked.ll

155 lines

Diff 113202

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 19,910 Lines • ▼ Show 20 Lines	case BRCST32x2_TO_VEC: {
SDValue Mask = Op.getOperand(3);		SDValue Mask = Op.getOperand(3);

assert((VT.getScalarType() == MVT::i32 \|\|		assert((VT.getScalarType() == MVT::i32 \|\|
VT.getScalarType() == MVT::f32) && "Unexpected type!");		VT.getScalarType() == MVT::f32) && "Unexpected type!");
//bitcast Src to packed 64		//bitcast Src to packed 64
MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;		MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);		MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
Src = DAG.getBitcast(BitcastVT, Src);		Src = DAG.getBitcast(BitcastVT, Src);
		MVT ResVT = MVT::getVectorVT(ScalarVT, VT.getSizeInBits()/64);
		SDValue Res = DAG.getNode(IntrData->Opc0, dl, ResVT, Src);
		Res = DAG.getBitcast(VT, Res);

return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),		return getVectorMaskingNode(Res, Mask, PassThru, Subtarget, DAG);
Mask, PassThru, Subtarget, DAG);
}		}
default:		default:
break;		break;
}		}
}		}

switch (IntNo) {		switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.		default: return SDValue(); // Don't custom lower most intrinsics.
▲ Show 20 Lines • Show All 16,928 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86InstrAVX512.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,080 Lines • ▼ Show 20 Lines	def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
(COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;		(COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;
def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,		def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
(X86VBroadcast SrcInfo.FRC:$src),		(X86VBroadcast SrcInfo.FRC:$src),
DestInfo.ImmAllZerosV)),		DestInfo.ImmAllZerosV)),
(!cast<Instruction>(NAME#DestInfo.ZSuffix#rkz)		(!cast<Instruction>(NAME#DestInfo.ZSuffix#rkz)
DestInfo.KRCWM:$mask, (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;		DestInfo.KRCWM:$mask, (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;
}		}

multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,		// Split version to allow mask and broadcast node to be different types. This
X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {		// helps support the 32x2 broadcasts.
		multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
		X86VectorVTInfo MaskInfo,
		X86VectorVTInfo DestInfo,
		X86VectorVTInfo SrcInfo> {
let ExeDomain = DestInfo.ExeDomain in {		let ExeDomain = DestInfo.ExeDomain in {
defm r : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),		defm r : AVX512_maskable<opc, MRMSrcReg, MaskInfo, (outs MaskInfo.RC:$dst),
(ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src",		(ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src",
(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))>,		(MaskInfo.VT
		(bitconvert
		(DestInfo.VT
		(X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))))>,
T8PD, EVEX;		T8PD, EVEX;
defm m : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),		defm m : AVX512_maskable<opc, MRMSrcMem, MaskInfo, (outs MaskInfo.RC:$dst),
(ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src",		(ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
		(MaskInfo.VT
		(bitconvert
(DestInfo.VT (X86VBroadcast		(DestInfo.VT (X86VBroadcast
(SrcInfo.ScalarLdFrag addr:$src)))>,		(SrcInfo.ScalarLdFrag addr:$src)))))>,
T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>;		T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>;
}		}

def : Pat<(DestInfo.VT (X86VBroadcast		def : Pat<(MaskInfo.VT
		(bitconvert
		(DestInfo.VT (X86VBroadcast
(SrcInfo.VT (scalar_to_vector		(SrcInfo.VT (scalar_to_vector
(SrcInfo.ScalarLdFrag addr:$src))))),		(SrcInfo.ScalarLdFrag addr:$src))))))),
(!cast<Instruction>(NAME#DestInfo.ZSuffix#m) addr:$src)>;		(!cast<Instruction>(NAME#MaskInfo.ZSuffix#m) addr:$src)>;
def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,		def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask,
		(bitconvert
		(DestInfo.VT
(X86VBroadcast		(X86VBroadcast
(SrcInfo.VT (scalar_to_vector		(SrcInfo.VT (scalar_to_vector
(SrcInfo.ScalarLdFrag addr:$src)))),		(SrcInfo.ScalarLdFrag addr:$src)))))),
DestInfo.RC:$src0)),		MaskInfo.RC:$src0)),
(!cast<Instruction>(NAME#DestInfo.ZSuffix#mk)		(!cast<Instruction>(NAME#DestInfo.ZSuffix#mk)
DestInfo.RC:$src0, DestInfo.KRCWM:$mask, addr:$src)>;		MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, addr:$src)>;
def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,		def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask,
		(bitconvert
		(DestInfo.VT
(X86VBroadcast		(X86VBroadcast
(SrcInfo.VT (scalar_to_vector		(SrcInfo.VT (scalar_to_vector
(SrcInfo.ScalarLdFrag addr:$src)))),		(SrcInfo.ScalarLdFrag addr:$src)))))),
DestInfo.ImmAllZerosV)),		MaskInfo.ImmAllZerosV)),
(!cast<Instruction>(NAME#DestInfo.ZSuffix#mkz)		(!cast<Instruction>(NAME#MaskInfo.ZSuffix#mkz)
DestInfo.KRCWM:$mask, addr:$src)>;		MaskInfo.KRCWM:$mask, addr:$src)>;
}		}

		// Helper class to force mask and broadcast result to same type.
		multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
		X86VectorVTInfo DestInfo,
		X86VectorVTInfo SrcInfo> :
		avx512_broadcast_rm_split<opc, OpcodeStr, DestInfo, DestInfo, SrcInfo>;

multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr,		multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _> {		AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in		let Predicates = [HasAVX512] in
defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,		defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
avx512_broadcast_scalar<opc, OpcodeStr, _.info512, _.info128>,		avx512_broadcast_scalar<opc, OpcodeStr, _.info512, _.info128>,
EVEX_V512;		EVEX_V512;

let Predicates = [HasVLX] in {		let Predicates = [HasVLX] in {
▲ Show 20 Lines • Show All 306 Lines • ▼ Show 20 Lines
defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8",		defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8",
v16f32_info, v8f32x_info>,		v16f32_info, v8f32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT8>;		EVEX_V512, EVEX_CD8<32, CD8VT8>;
}		}

multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,		multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> {		AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> {
let Predicates = [HasDQI] in		let Predicates = [HasDQI] in
defm Z : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info512, _Src.info128>,		defm Z : avx512_broadcast_rm_split<opc, OpcodeStr, _Dst.info512,
		_Src.info512, _Src.info128>,
EVEX_V512;		EVEX_V512;
let Predicates = [HasDQI, HasVLX] in		let Predicates = [HasDQI, HasVLX] in
defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info256, _Src.info128>,		defm Z256 : avx512_broadcast_rm_split<opc, OpcodeStr, _Dst.info256,
		_Src.info256, _Src.info128>,
EVEX_V256;		EVEX_V256;
}		}

multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr,		multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> :		AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> :
avx512_common_broadcast_32x2<opc, OpcodeStr, _Dst, _Src> {		avx512_common_broadcast_32x2<opc, OpcodeStr, _Dst, _Src> {

let Predicates = [HasDQI, HasVLX] in		let Predicates = [HasDQI, HasVLX] in
defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info128, _Src.info128>,		defm Z128 : avx512_broadcast_rm_split<opc, OpcodeStr, _Dst.info128,
		_Src.info128, _Src.info128>,
EVEX_V128;		EVEX_V128;
}		}

defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2",		defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2",
avx512vl_i32_info, avx512vl_i64_info>;		avx512vl_i32_info, avx512vl_i64_info>;
defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2",		defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2",
avx512vl_f32_info, avx512vl_f64_info>;		avx512vl_f32_info, avx512vl_f64_info>;

let Predicates = [HasVLX] in {		let Predicates = [HasVLX] in {
▲ Show 20 Lines • Show All 9,094 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/vector-shuffle-masked.ll

	Show First 20 Lines • Show All 1,674 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%1 = load <8 x i32>, <8 x i32> *%p			%1 = load <8 x i32>, <8 x i32> *%p
	%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>			%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
	%3 = bitcast <16 x i32> %2 to <8 x i64>			%3 = bitcast <16 x i32> %2 to <8 x i64>
	%mask.cast = bitcast i8 %mask to <8 x i1>			%mask.cast = bitcast i8 %mask to <8 x i1>
	%res = select <8 x i1> %mask.cast, <8 x i64> %3, <8 x i64> zeroinitializer			%res = select <8 x i1> %mask.cast, <8 x i64> %3, <8 x i64> zeroinitializer
	ret <8 x i64> %res			ret <8 x i64> %res
	}			}

				define <4 x float> @test_broadcastf32x2_v4f32(<4 x float> %vec, <4 x float> %passthru, i8 %mask) {
				; CHECK-LABEL: test_broadcastf32x2_v4f32:
				; CHECK: # BB#0:
				; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
				; CHECK-NEXT: kmovw %edi, %k1
				; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
				; CHECK-NEXT: retq
				%shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
				%mask.cast = bitcast i8 %mask to <8 x i1>
				%mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				%res = select <4 x i1> %mask.extract, <4 x float> %shuf, <4 x float> %passthru
				ret <4 x float> %res
				}

				define <4 x float> @test_broadcastf32x2_v4f32_z(<4 x float> %vec, i8 %mask) {
				; CHECK-LABEL: test_broadcastf32x2_v4f32_z:
				; CHECK: # BB#0:
				; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
				; CHECK-NEXT: kmovw %edi, %k1
				; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
				; CHECK-NEXT: retq
				%shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
				%mask.cast = bitcast i8 %mask to <8 x i1>
				%mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				%res = select <4 x i1> %mask.extract, <4 x float> %shuf, <4 x float> zeroinitializer
				ret <4 x float> %res
				}

				define <4 x i32> @test_broadcasti32x2_v4i32(<4 x i32> %vec, <4 x i32> %passthru, i8 %mask) {
				; CHECK-LABEL: test_broadcasti32x2_v4i32:
				; CHECK: # BB#0:
				; CHECK-NEXT: kmovw %edi, %k1
				; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1}
				; CHECK-NEXT: vmovdqa %xmm1, %xmm0
				; CHECK-NEXT: retq
				%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
				%mask.cast = bitcast i8 %mask to <8 x i1>
				%mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				%res = select <4 x i1> %mask.extract, <4 x i32> %shuf, <4 x i32> %passthru
				ret <4 x i32> %res
				}

				define <4 x i32> @test_broadcasti32x2_v4i32_z(<4 x i32> %vec, i8 %mask) {
				; CHECK-LABEL: test_broadcasti32x2_v4i32_z:
				; CHECK: # BB#0:
				; CHECK-NEXT: kmovw %edi, %k1
				; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm0 {%k1} {z}
				; CHECK-NEXT: retq
				%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
				%mask.cast = bitcast i8 %mask to <8 x i1>
				%mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				%res = select <4 x i1> %mask.extract, <4 x i32> %shuf, <4 x i32> zeroinitializer
				ret <4 x i32> %res
				}

				define <8 x float> @test_broadcastf32x2_v8f32(<8 x float> %vec, <8 x float> %passthru, i8 %mask) {
				; CHECK-LABEL: test_broadcastf32x2_v8f32:
				; CHECK: # BB#0:
				; CHECK-NEXT: kmovw %edi, %k1
				; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
				; CHECK-NEXT: vmovapd %ymm1, %ymm0
				; CHECK-NEXT: retq
				%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
				%mask.cast = bitcast i8 %mask to <8 x i1>
				%res = select <8 x i1> %mask.cast, <8 x float> %shuf, <8 x float> %passthru
				ret <8 x float> %res
				}

				define <8 x float> @test_broadcastf32x2_v8f32_z(<8 x float> %vec, i8 %mask) {
				; CHECK-LABEL: test_broadcastf32x2_v8f32_z:
				; CHECK: # BB#0:
				; CHECK-NEXT: kmovw %edi, %k1
				; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
				; CHECK-NEXT: retq
				%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
				%mask.cast = bitcast i8 %mask to <8 x i1>
				%res = select <8 x i1> %mask.cast, <8 x float> %shuf, <8 x float> zeroinitializer
				ret <8 x float> %res
				}

				define <8 x i32> @test_broadcasti32x2_v8i32(<8 x i32> %vec, <8 x i32> %passthru, i8 %mask) {
				; CHECK-LABEL: test_broadcasti32x2_v8i32:
				; CHECK: # BB#0:
				; CHECK-NEXT: kmovw %edi, %k1
				; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
				; CHECK-NEXT: vmovdqa %ymm1, %ymm0
				; CHECK-NEXT: retq
				%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
				%mask.cast = bitcast i8 %mask to <8 x i1>
				%res = select <8 x i1> %mask.cast, <8 x i32> %shuf, <8 x i32> %passthru
				ret <8 x i32> %res
				}

				define <8 x i32> @test_broadcasti32x2_v8i32_z(<8 x i32> %vec, i8 %mask) {
				; CHECK-LABEL: test_broadcasti32x2_v8i32_z:
				; CHECK: # BB#0:
				; CHECK-NEXT: kmovw %edi, %k1
				; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
				; CHECK-NEXT: retq
				%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
				%mask.cast = bitcast i8 %mask to <8 x i1>
				%res = select <8 x i1> %mask.cast, <8 x i32> %shuf, <8 x i32> zeroinitializer
				ret <8 x i32> %res
				}

				define <16 x float> @test_broadcastf32x2_v16f32_z(<16 x float> %vec, i16 %mask) {
				; CHECK-LABEL: test_broadcastf32x2_v16f32_z:
				; CHECK: # BB#0:
				; CHECK-NEXT: kmovw %edi, %k1
				; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
				; CHECK-NEXT: retq
				%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
				%mask.cast = bitcast i16 %mask to <16 x i1>
				%res = select <16 x i1> %mask.cast, <16 x float> %shuf, <16 x float> zeroinitializer
				ret <16 x float> %res
				}

				define <16 x i32> @test_broadcasti32x2_v16i32(<16 x i32> %vec, <16 x i32> %passthru, i16 %mask) {
				; CHECK-LABEL: test_broadcasti32x2_v16i32:
				; CHECK: # BB#0:
				; CHECK-NEXT: kmovw %edi, %k1
				; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
				; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
				; CHECK-NEXT: retq
				%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
				%mask.cast = bitcast i16 %mask to <16 x i1>
				%res = select <16 x i1> %mask.cast, <16 x i32> %shuf, <16 x i32> %passthru
				ret <16 x i32> %res
				}

				define <16 x float> @test_broadcastf32x2_v16f32(<16 x float> %vec, <16 x float> %passthru, i16 %mask) {
				; CHECK-LABEL: test_broadcastf32x2_v16f32:
				; CHECK: # BB#0:
				; CHECK-NEXT: kmovw %edi, %k1
				; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
				; CHECK-NEXT: vmovapd %zmm1, %zmm0
				; CHECK-NEXT: retq
				%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
				%mask.cast = bitcast i16 %mask to <16 x i1>
				%res = select <16 x i1> %mask.cast, <16 x float> %shuf, <16 x float> %passthru
				ret <16 x float> %res
				}

				define <16 x i32> @test_broadcasti32x2_v16i32_z(<16 x i32> %vec, i16 %mask) {
				; CHECK-LABEL: test_broadcasti32x2_v16i32_z:
				; CHECK: # BB#0:
				; CHECK-NEXT: kmovw %edi, %k1
				; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
				; CHECK-NEXT: retq
				%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
				%mask.cast = bitcast i16 %mask to <16 x i1>
				%res = select <16 x i1> %mask.cast, <16 x i32> %shuf, <16 x i32> zeroinitializer
				ret <16 x i32> %res
				}