This is an archive of the discontinued LLVM Phabricator instance.

[AVX512] Bring back vector-shuffle lowering support through broadcasts
Needs RevisionPublic

Authored by rob.khasanov on Oct 21 2014, 11:02 AM.

Download Raw Diff

Details

Reviewers

Summary

I found that after your commit at rev219046, the lowering of 512-bit broadcasts became non-optimal. Most of the tests on broadcasting and embedded broadcasting were changed, and they no longer produce efficient code.

The example below is taken from the changes in your commit (it’s the first test in test/CodeGen/X86/avx512-vbroadcast.ll):

define <16 x i32> @_inreg16xi32(i32 %a) {
; CHECK-LABEL: _inreg16xi32:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpbroadcastd %edi, %zmm0
+; CHECK-NEXT: vmovd %edi, %xmm0
+; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0
+; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; CHECK-NEXT: retq
%b = insertelement <16 x i32> undef, i32 %a, i32 0

%c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
 ret <16 x i32> %c

}

Here, a 256-bit broadcast was generated instead of a 512-bit one.

I investigated the reason and found that your version of vector-shuffle lowering has no AVX-512 support.

In this patch

I added vector-shuffle lowering through broadcasts
Removed asserts and branches like the following, because they are incorrect:
assert(Subtarget->hasDQI() && "We can only lower v8i64 with AVX-512-DQI");
Fixed lowering tests

Diff Detail

Event Timeline

rob.khasanov updated this revision to Diff 15194.Oct 21 2014, 11:02 AM

rob.khasanov retitled this revision from to [AVX512] Bring back vector-shuffle lowering support through broadcasts.

rob.khasanov updated this object.

rob.khasanov edited the test plan for this revision. (Show Details)

rob.khasanov added a reviewer: chandlerc.

rob.khasanov added subscribers: Unknown Object (MLST), anemet, delena.

Why do you need so many functions:
lowerV16F32VectorShuffle
lowerV8I64VectorShuffle
lowerV16I32VectorShuffle
.. ?

I don't see a big diff between them.

Elena

Sorry for the long delay...

lib/Target/X86/X86ISelLowering.cpp
10235	(ignore this)
10313–10319	These changes don't make sense to me. My understanding was that DQI was needed to shuffle integers.

chandlerc requested changes to this revision.Mar 29 2015, 2:09 PM

chandlerc edited edge metadata.

This revision now requires changes to proceed.Mar 29 2015, 2:09 PM

Revision Contents

Path

Size

lib/

Target/

X86/

X86ISelLowering.cpp

29 lines

X86InstrAVX512.td

10 lines

test/

CodeGen/

X86/

avx512-arith.ll

5 lines

avx512-vbroadcast.ll

22 lines

avx512-vec-cmp.ll

24 lines

vector-shuffle-512-v8.ll

6 lines

Diff 15194

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 10,197 Lines • ▼ Show 20 Lines	static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
SelectionDAG &DAG) {		SelectionDAG &DAG) {
SDLoc DL(Op);		SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");		assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");		assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);		ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();		ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");		assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

		// Check for being able to broadcast a single element.
		if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f64, DL, V1,
		Mask, Subtarget, DAG))
		return Broadcast;

// FIXME: Implement direct support for this type!		// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);		return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
}		}

/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.		/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,		static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
const X86Subtarget *Subtarget,		const X86Subtarget *Subtarget,
SelectionDAG &DAG) {		SelectionDAG &DAG) {
SDLoc DL(Op);		SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");		assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");		assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);		ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();		ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");		assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

		// Check for being able to broadcast a single element.
		if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16f32, DL, V1,
		Mask, Subtarget, DAG))
		return Broadcast;

// FIXME: Implement direct support for this type!		// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);		return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
}		}

/// \brief Handle lowering of 8-lane 64-bit integer shuffles.		/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,		static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
const X86Subtarget *Subtarget,		const X86Subtarget *Subtarget,
SelectionDAG &DAG) {		SelectionDAG &DAG) {
SDLoc DL(Op);		SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");		assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");		assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);		ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();		ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");		assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
assert(Subtarget->hasDQI() && "We can only lower v8i64 with AVX-512-DQI");
chandlercUnsubmitted Not Done Reply Inline Actions (ignore this) chandlerc: (ignore this)
		// Check for being able to broadcast a single element.
		if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i64, DL, V1,
		Mask, Subtarget, DAG))
		return Broadcast;

// FIXME: Implement direct support for this type!		// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);		return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
}		}

/// \brief Handle lowering of 16-lane 32-bit integer shuffles.		/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,		static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
const X86Subtarget *Subtarget,		const X86Subtarget *Subtarget,
SelectionDAG &DAG) {		SelectionDAG &DAG) {
SDLoc DL(Op);		SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");		assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");		assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);		ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();		ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");		assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
assert(Subtarget->hasDQI() && "We can only lower v16i32 with AVX-512-DQI!");

		// Check for being able to broadcast a single element.
		if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i32, DL, V1,
		Mask, Subtarget, DAG))
		return Broadcast;
// FIXME: Implement direct support for this type!		// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);		return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
}		}

/// \brief Handle lowering of 32-lane 16-bit integer shuffles.		/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,		static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
const X86Subtarget *Subtarget,		const X86Subtarget *Subtarget,
SelectionDAG &DAG) {		SelectionDAG &DAG) {
▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines	static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// lower them. Each lowering routine of a given type is allowed to assume that		// lower them. Each lowering routine of a given type is allowed to assume that
// the requisite ISA extensions for that element type are available.		// the requisite ISA extensions for that element type are available.
switch (VT.SimpleTy) {		switch (VT.SimpleTy) {
case MVT::v8f64:		case MVT::v8f64:
return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);		return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v16f32:		case MVT::v16f32:
return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);		return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v8i64:		case MVT::v8i64:
if (Subtarget->hasDQI())
return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);		return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
break;
case MVT::v16i32:		case MVT::v16i32:
if (Subtarget->hasDQI())
return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);		return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
break;
chandlercUnsubmitted Not Done Reply Inline Actions These changes don't make sense to me. My understanding was that DQI was needed to shuffle integers. chandlerc: These changes don't make sense to me. My understanding was that DQI was needed to shuffle…
case MVT::v32i16:		case MVT::v32i16:
if (Subtarget->hasBWI())		if (Subtarget->hasBWI())
return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);		return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
break;		break;
case MVT::v64i8:		case MVT::v64i8:
if (Subtarget->hasBWI())		if (Subtarget->hasBWI())
return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);		return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
break;		break;
▲ Show 20 Lines • Show All 15,299 Lines • Show Last 20 Lines

lib/Target/X86/X86InstrAVX512.td

	Show First 20 Lines • Show All 705 Lines • ▼ Show 20 Lines
	def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))),			def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))),
	(VPBROADCASTQZrr VR128X:$src)>;			(VPBROADCASTQZrr VR128X:$src)>;

	def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))),			def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))),
	(VBROADCASTSSZrr VR128X:$src)>;			(VBROADCASTSSZrr VR128X:$src)>;
	def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))),			def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))),
	(VBROADCASTSDZrr VR128X:$src)>;			(VBROADCASTSDZrr VR128X:$src)>;

				def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))),
				(VBROADCASTSSZrr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
				def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))),
				(VBROADCASTSDZrr (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;

				def : Pat<(v16i32 (X86VBroadcast (v16i32 VR512:$src))),
				(VPBROADCASTDZrr (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>;
				def : Pat<(v8i64 (X86VBroadcast (v8i64 VR512:$src))),
				(VPBROADCASTQZrr (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>;

	def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))),			def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))),
	(VBROADCASTSSZrr VR128X:$src)>;			(VBROADCASTSSZrr VR128X:$src)>;
	def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))),			def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))),
	(VBROADCASTSDZrr VR128X:$src)>;			(VBROADCASTSDZrr VR128X:$src)>;

	// Provide fallback in case the load node that is used in the patterns above			// Provide fallback in case the load node that is used in the patterns above
	// is used by additional users, which prevents the pattern selection.			// is used by additional users, which prevents the pattern selection.
	def : Pat<(v16f32 (X86VBroadcast FR32X:$src)),			def : Pat<(v16f32 (X86VBroadcast FR32X:$src)),
	▲ Show 20 Lines • Show All 4,408 Lines • Show Last 20 Lines

test/CodeGen/X86/avx512-arith.ll

Show First 20 Lines • Show All 447 Lines • ▼ Show 20 Lines	entry:
%a = load <16 x i32>* %x, align 4		%a = load <16 x i32>* %x, align 4
%b = and <16 x i32> %y, %a		%b = and <16 x i32> %y, %a
ret <16 x i32> %b		ret <16 x i32> %b
}		}

define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) {		define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) {
; CHECK-LABEL: andqbrst:		; CHECK-LABEL: andqbrst:
; CHECK: ## BB#0: ## %entry		; CHECK: ## BB#0: ## %entry
; CHECK-NEXT: vmovq (%rdi), %xmm1		; CHECK-NEXT: vpandq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT: vpbroadcastq %xmm1, %ymm1
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
entry:		entry:
%a = load i64* %ap, align 8		%a = load i64* %ap, align 8
%b = insertelement <8 x i64> undef, i64 %a, i32 0		%b = insertelement <8 x i64> undef, i64 %a, i32 0
%c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer		%c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
%d = and <8 x i64> %p1, %c		%d = and <8 x i64> %p1, %c
ret <8 x i64>%d		ret <8 x i64>%d
}		}

test/CodeGen/X86/avx512-vbroadcast.ll

	; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl \| FileCheck %s			; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl \| FileCheck %s

	define <16 x i32> @_inreg16xi32(i32 %a) {			define <16 x i32> @_inreg16xi32(i32 %a) {
	; CHECK-LABEL: _inreg16xi32:			; CHECK-LABEL: _inreg16xi32:
	; CHECK: ## BB#0:			; CHECK: ## BB#0:
	; CHECK-NEXT: vmovd %edi, %xmm0			; CHECK-NEXT: vpbroadcastd %edi, %zmm0
	; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0
	; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%b = insertelement <16 x i32> undef, i32 %a, i32 0			%b = insertelement <16 x i32> undef, i32 %a, i32 0
	%c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer			%c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
	ret <16 x i32> %c			ret <16 x i32> %c
	}			}

	define <8 x i64> @_inreg8xi64(i64 %a) {			define <8 x i64> @_inreg8xi64(i64 %a) {
	; CHECK-LABEL: _inreg8xi64:			; CHECK-LABEL: _inreg8xi64:
	; CHECK: ## BB#0:			; CHECK: ## BB#0:
	; CHECK-NEXT: vmovq %rdi, %xmm0			; CHECK-NEXT: vpbroadcastq %rdi, %zmm0
	; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
	; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%b = insertelement <8 x i64> undef, i64 %a, i32 0			%b = insertelement <8 x i64> undef, i64 %a, i32 0
	%c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer			%c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
	ret <8 x i64> %c			ret <8 x i64> %c
	}			}

	define <16 x float> @_inreg16xfloat(float %a) {			define <16 x float> @_inreg16xfloat(float %a) {
	; CHECK-LABEL: _inreg16xfloat:			; CHECK-LABEL: _inreg16xfloat:
	; CHECK: ## BB#0:			; CHECK: ## BB#0:
	; CHECK-NEXT: ## kill: XMM0<def> XMM0<kill> ZMM0<def>			; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
	; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
	; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%b = insertelement <16 x float> undef, float %a, i32 0			%b = insertelement <16 x float> undef, float %a, i32 0
	%c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer			%c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
	ret <16 x float> %c			ret <16 x float> %c
	}			}

	define <8 x double> @_inreg8xdouble(double %a) {			define <8 x double> @_inreg8xdouble(double %a) {
	; CHECK-LABEL: _inreg8xdouble:			; CHECK-LABEL: _inreg8xdouble:
	; CHECK: ## BB#0:			; CHECK: ## BB#0:
	; CHECK-NEXT: ## kill: XMM0<def> XMM0<kill> ZMM0<def>			; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
	; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
	; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%b = insertelement <8 x double> undef, double %a, i32 0			%b = insertelement <8 x double> undef, double %a, i32 0
	%c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer			%c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
	ret <8 x double> %c			ret <8 x double> %c
	}			}

	define <16 x i32> @_xmm16xi32(<16 x i32> %a) {			define <16 x i32> @_xmm16xi32(<16 x i32> %a) {
	; CHECK-LABEL: _xmm16xi32:			; CHECK-LABEL: _xmm16xi32:
	; CHECK: ## BB#0:			; CHECK: ## BB#0:
	; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0			; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0
	; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> zeroinitializer			%b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> zeroinitializer
	ret <16 x i32> %b			ret <16 x i32> %b
	}			}

	define <16 x float> @_xmm16xfloat(<16 x float> %a) {			define <16 x float> @_xmm16xfloat(<16 x float> %a) {
	; CHECK-LABEL: _xmm16xfloat:			; CHECK-LABEL: _xmm16xfloat:
	; CHECK: ## BB#0:			; CHECK: ## BB#0:
	; CHECK-NEXT: vbroadcastss %xmm0, %ymm0			; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
	; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer			%b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer
	ret <16 x float> %b			ret <16 x float> %b
	}			}

	define <16 x i32> @test_vbroadcast() {			define <16 x i32> @test_vbroadcast() {
	; CHECK-LABEL: test_vbroadcast:			; CHECK-LABEL: test_vbroadcast:
	; CHECK: ## BB#0: ## %entry			; CHECK: ## BB#0: ## %entry
	▲ Show 20 Lines • Show All 119 Lines • Show Last 20 Lines

test/CodeGen/X86/avx512-vec-cmp.ll

Show First 20 Lines • Show All 306 Lines • ▼ Show 20 Lines	; CHECK-NEXT: retq
%mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer		%mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1		%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
ret <16 x i32> %max		ret <16 x i32> %max
}		}

define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {		define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {
; CHECK-LABEL: test24:		; CHECK-LABEL: test24:
; CHECK: ## BB#0:		; CHECK: ## BB#0:
; CHECK-NEXT: vmovq (%rdi), %xmm2		; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1
; CHECK-NEXT: vpbroadcastq %xmm2, %ymm2
; CHECK-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}		; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0		; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%yb = load i64* %yb.ptr, align 4		%yb = load i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0		%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
%y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer		%y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
%mask = icmp eq <8 x i64> %x, %y		%mask = icmp eq <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1		%max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1
ret <8 x i64> %max		ret <8 x i64> %max
}		}

define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind {		define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind {
; CHECK-LABEL: test25:		; CHECK-LABEL: test25:
; CHECK: ## BB#0:		; CHECK: ## BB#0:
; CHECK-NEXT: vmovd (%rdi), %xmm2		; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1
; CHECK-NEXT: vpbroadcastd %xmm2, %ymm2
; CHECK-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
; CHECK-NEXT: vpcmpled %zmm2, %zmm0, %k1
; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}		; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0		; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%yb = load i32* %yb.ptr, align 4		%yb = load i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0		%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
%y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer		%y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
%mask = icmp sle <16 x i32> %x, %y		%mask = icmp sle <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1		%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
ret <16 x i32> %max		ret <16 x i32> %max
}		}

define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind {		define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind {
; CHECK-LABEL: test26:		; CHECK-LABEL: test26:
; CHECK: ## BB#0:		; CHECK: ## BB#0:
; CHECK-NEXT: vmovd (%rdi), %xmm3		; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1
; CHECK-NEXT: vpbroadcastd %xmm3, %ymm3		; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
; CHECK-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3
; CHECK-NEXT: vpcmpgtd %zmm3, %zmm0, %k1
; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1 {%k1}
; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}		; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0		; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%mask1 = icmp sge <16 x i32> %x1, %y1		%mask1 = icmp sge <16 x i32> %x1, %y1
%yb = load i32* %yb.ptr, align 4		%yb = load i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0		%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
%y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer		%y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
%mask0 = icmp sgt <16 x i32> %x, %y		%mask0 = icmp sgt <16 x i32> %x, %y
%mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer		%mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1		%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
ret <16 x i32> %max		ret <16 x i32> %max
}		}

define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind {		define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind {
; CHECK-LABEL: test27:		; CHECK-LABEL: test27:
; CHECK: ## BB#0:		; CHECK: ## BB#0:
; CHECK-NEXT: vmovq (%rdi), %xmm3		; CHECK-NEXT: vpcmpleq %zmm1, %zmm2, %k1
; CHECK-NEXT: vpbroadcastq %xmm3, %ymm3		; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
; CHECK-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3
; CHECK-NEXT: vpcmpleq %zmm3, %zmm0, %k1
; CHECK-NEXT: vpcmpleq %zmm1, %zmm2, %k1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}		; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0		; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i64> %x1, %y1		%mask1 = icmp sge <8 x i64> %x1, %y1
%yb = load i64* %yb.ptr, align 4		%yb = load i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0		%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
%y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer		%y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
%mask0 = icmp sle <8 x i64> %x, %y		%mask0 = icmp sle <8 x i64> %x, %y
%mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer		%mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
%max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1		%max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1
ret <8 x i64> %max		ret <8 x i64> %max
}		}

test/CodeGen/X86/vector-shuffle-512-v8.ll

	; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f -x86-experimental-vector-shuffle-lowering \| FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F			; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f -x86-experimental-vector-shuffle-lowering \| FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
	; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -x86-experimental-vector-shuffle-lowering \| FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW			; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -x86-experimental-vector-shuffle-lowering \| FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW

	target triple = "x86_64-unknown-unknown"			target triple = "x86_64-unknown-unknown"

	define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) {			define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) {
	; ALL-LABEL: shuffle_v8f64_00000000:			; ALL-LABEL: shuffle_v8f64_00000000:
	; ALL: # BB#0:			; ALL: # BB#0:
	; ALL-NEXT: vbroadcastsd %xmm0, %ymm0			; ALL-NEXT: vbroadcastsd %xmm0, %zmm0
	; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
	; ALL-NEXT: retq			; ALL-NEXT: retq
	%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>			%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
	ret <8 x double> %shuffle			ret <8 x double> %shuffle
	}			}

	define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {			define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
	; ALL-LABEL: shuffle_v8f64_00000010:			; ALL-LABEL: shuffle_v8f64_00000010:
	; ALL: # BB#0:			; ALL: # BB#0:
	▲ Show 20 Lines • Show All 705 Lines • ▼ Show 20 Lines
	; ALL-NEXT: retq			; ALL-NEXT: retq
	%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10>			%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10>
	ret <8 x double> %shuffle			ret <8 x double> %shuffle
	}			}

	define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) {			define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) {
	; ALL-LABEL: shuffle_v8i64_00000000:			; ALL-LABEL: shuffle_v8i64_00000000:
	; ALL: # BB#0:			; ALL: # BB#0:
	; ALL-NEXT: vpbroadcastq %xmm0, %ymm0			; ALL-NEXT: vpbroadcastq %xmm0, %zmm0
	; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
	; ALL-NEXT: retq			; ALL-NEXT: retq
	%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>			%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
	ret <8 x i64> %shuffle			ret <8 x i64> %shuffle
	}			}

	define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) {			define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) {
	; ALL-LABEL: shuffle_v8i64_00000010:			; ALL-LABEL: shuffle_v8i64_00000010:
	; ALL: # BB#0:			; ALL: # BB#0:
	▲ Show 20 Lines • Show All 690 Lines • Show Last 20 Lines