This is an archive of the discontinued LLVM Phabricator instance.

[X86] Call lowerShuffleAsBitMask for 512-bit vectors in lowerShuffleAsBlend.
ClosedPublic

Authored by craig.topper on Mar 17 2019, 9:11 PM.

Download Raw Diff

Details

Reviewers

RKSimon
spatel
andreadb

Commits

rG03675533043d: [X86] Call lowerShuffleAsBitMask for 512-bit vectors in lowerShuffleAsBlend.
rL356618: [X86] Call lowerShuffleAsBitMask for 512-bit vectors in lowerShuffleAsBlend.

Summary

This patch enables the use of lowerShuffleAsBitMask for 512-bit blends before
falling back to move immediate, GPR to k-register, and masked op.

I had to make some changes to support v8i64 when i64 is not a legal type. And to
support floating point types.

This trades a load for the move immediate and GPR move which is higher latency.
But it's probably better for register pressure not having to hop through other
register classes. The load+and should play better with LICM and
rematerialization I think.

Diff Detail

Repository

rL LLVM

Build Status

Buildable 29276
Build 29275: arc lint + arc unit

Event Timeline

craig.topper created this revision.Mar 17 2019, 9:11 PM

Herald added a project: Restricted Project. · View Herald TranscriptMar 17 2019, 9:11 PM

Harbormaster completed remote builds in B29276: Diff 191053.Mar 17 2019, 9:11 PM

Should we not do this if opt-size is set?

Add optsize qualification. Copied one of the test cases in avx512-mask-op.ll and added the optsize attribute to test it.

Herald added a subscriber: hiraditya. · View Herald TranscriptMar 19 2019, 12:48 PM

LGTM - cheers

This revision is now accepted and ready to land.Mar 20 2019, 2:04 AM

Closed by commit rL356618: [X86] Call lowerShuffleAsBitMask for 512-bit vectors in lowerShuffleAsBlend. (authored by ctopper). · Explain WhyMar 20 2019, 2:30 PM

This revision was automatically updated to reflect the committed changes.

I've bisected https://bugs.llvm.org/show_bug.cgi?id=41203 to this commit, which seems to cause crashes when building the test-suite with AVX512.

Revision Contents

Path

Size

lib/

Target/

X86/

X86ISelLowering.cpp

51 lines

test/

CodeGen/

X86/

avx512-mask-op.ll

12 lines

merge-consecutive-loads-512.ll

78 lines

vector-shuffle-512-v32.ll

6 lines

Diff 191053

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 10,358 Lines • ▼ Show 20 Lines
}		}

/// Try to emit a bitmask instruction for a shuffle.		/// Try to emit a bitmask instruction for a shuffle.
///		///
/// This handles cases where we can model a blend exactly as a bitmask due to		/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.		/// one of the inputs being zeroable.
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,		static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,		SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, SelectionDAG &DAG) {		const APInt &Zeroable,
assert(!VT.isFloatingPoint() && "Floating point types are not supported");		const X86Subtarget &Subtarget,
		SelectionDAG &DAG) {
		MVT MaskVT = VT;
MVT EltVT = VT.getVectorElementType();		MVT EltVT = VT.getVectorElementType();
SDValue Zero = DAG.getConstant(0, DL, EltVT);		SDValue Zero, AllOnes;
SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);		// Use f64 if i64 isn't legal.
		if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
		EltVT = MVT::f64;
		MaskVT = MVT::getVectorVT(EltVT, Mask.size());
		}

		MVT LogicVT = VT;
		if (EltVT == MVT::f32 \|\| EltVT == MVT::f64) {
		Zero = DAG.getConstantFP(0.0, DL, MVT::f64);
		AllOnes = DAG.getConstantFP(APInt::getAllOnesValue(64).bitsToDouble(), DL,
		EltVT);
		LogicVT = MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32,
		Mask.size());
		} else {
		Zero = DAG.getConstant(0, DL, EltVT);
		AllOnes = DAG.getAllOnesConstant(DL, EltVT);
		}

SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);		SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
SDValue V;		SDValue V;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {		for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Zeroable[i])		if (Zeroable[i])
continue;		continue;
if (Mask[i] % Size != i)		if (Mask[i] % Size != i)
return SDValue(); // Not a blend.		return SDValue(); // Not a blend.
if (!V)		if (!V)
V = Mask[i] < Size ? V1 : V2;		V = Mask[i] < Size ? V1 : V2;
else if (V != (Mask[i] < Size ? V1 : V2))		else if (V != (Mask[i] < Size ? V1 : V2))
return SDValue(); // Can only let one input through the mask.		return SDValue(); // Can only let one input through the mask.

VMaskOps[i] = AllOnes;		VMaskOps[i] = AllOnes;
}		}
if (!V)		if (!V)
return SDValue(); // No non-zeroable elements!		return SDValue(); // No non-zeroable elements!

SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);		SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
return DAG.getNode(ISD::AND, DL, VT, V, VMask);		VMask = DAG.getBitcast(LogicVT, VMask);
		V = DAG.getBitcast(LogicVT, V);
		SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
		return DAG.getBitcast(VT, And);
}		}

/// Try to emit a blend instruction for a shuffle using bit math.		/// Try to emit a blend instruction for a shuffle using bit math.
///		///
/// This is used as a fallback approach when first class blend instructions are		/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could		/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.		/// be generalized for floating point vectors if desirable.
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,		static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
▲ Show 20 Lines • Show All 148 Lines • ▼ Show 20 Lines	static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
case MVT::v32i8:		case MVT::v32i8:
assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");		assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
LLVM_FALLTHROUGH;		LLVM_FALLTHROUGH;
case MVT::v16i8: {		case MVT::v16i8: {
assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");		assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");

// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.		// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,		if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
DAG))		Subtarget, DAG))
return Masked;		return Masked;

if (Subtarget.hasBWI() && Subtarget.hasVLX()) {		if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
MVT IntegerType =		MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));		MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);		SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);		return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}		}
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	return DAG.getBitcast(
V1, V2));		V1, V2));
}		}
case MVT::v16f32:		case MVT::v16f32:
case MVT::v8f64:		case MVT::v8f64:
case MVT::v8i64:		case MVT::v8i64:
case MVT::v16i32:		case MVT::v16i32:
case MVT::v32i16:		case MVT::v32i16:
case MVT::v64i8: {		case MVT::v64i8: {
		// Attempt to lower to a bitmask if we can.
		if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
		Subtarget, DAG))
		return Masked;

		// Otherwise load an immediate into a GPR, cast to k-register, and use a
		// masked move.
MVT IntegerType =		MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));		MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);		SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);		return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}		}
default:		default:
llvm_unreachable("Not a supported integer vector type!");		llvm_unreachable("Not a supported integer vector type!");
}		}
▲ Show 20 Lines • Show All 2,140 Lines • ▼ Show 20 Lines	static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// exact same predicate.		// exact same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();		bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)		if (IsBlendSupported)
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,		if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))		Zeroable, Subtarget, DAG))
return Blend;		return Blend;

if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,		if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, DAG))		Zeroable, Subtarget, DAG))
return Masked;		return Masked;

// Use dedicated unpack instructions for masks that match their pattern.		// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))		if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
return V;		return V;

// Try to use byte rotation instructions.		// Try to use byte rotation instructions.
// Its more profitable for pre-SSSE3 to use shuffles/unpacks.		// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
▲ Show 20 Lines • Show All 684 Lines • ▼ Show 20 Lines	static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// exact same predicate.		// exact same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();		bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)		if (IsBlendSupported)
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,		if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))		Zeroable, Subtarget, DAG))
return Blend;		return Blend;

if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,		if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, DAG))		Zeroable, Subtarget, DAG))
return Masked;		return Masked;

// Use dedicated unpack instructions for masks that match their pattern.		// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))		if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;		return V;

// Use dedicated pack instructions for masks that match their pattern.		// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,		if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
▲ Show 20 Lines • Show All 251 Lines • ▼ Show 20 Lines	auto tryToWidenViaDuplication = [&]() -> SDValue {
DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),		DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));		DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
};		};
if (SDValue V = tryToWidenViaDuplication())		if (SDValue V = tryToWidenViaDuplication())
return V;		return V;
}		}

if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,		if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, DAG))		Zeroable, Subtarget, DAG))
return Masked;		return Masked;

// Use dedicated unpack instructions for masks that match their pattern.		// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))		if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;		return V;

// Try to use byte shift instructions to mask.		// Try to use byte shift instructions to mask.
if (SDValue V = lowerVectorShuffleAsByteShiftMask(		if (SDValue V = lowerVectorShuffleAsByteShiftMask(
▲ Show 20 Lines • Show All 1,819 Lines • ▼ Show 20 Lines	static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
// types. Since we'll use floating point types there eventually, just		// types. Since we'll use floating point types there eventually, just
// immediately cast everything to a float and operate entirely in that domain.		// immediately cast everything to a float and operate entirely in that domain.
if (VT.isInteger() && !Subtarget.hasAVX2()) {		if (VT.isInteger() && !Subtarget.hasAVX2()) {
int ElementBits = VT.getScalarSizeInBits();		int ElementBits = VT.getScalarSizeInBits();
if (ElementBits < 32) {		if (ElementBits < 32) {
// No floating point type available, if we can't use the bit operations		// No floating point type available, if we can't use the bit operations
// for masking/blending then decompose into 128-bit vectors.		// for masking/blending then decompose into 128-bit vectors.
if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,		if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
DAG))		Subtarget, DAG))
return V;		return V;
if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))		if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;		return V;
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);		return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}		}

MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),		MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
VT.getVectorNumElements());		VT.getVectorNumElements());
▲ Show 20 Lines • Show All 28,304 Lines • Show Last 20 Lines

test/CodeGen/X86/avx512-mask-op.ll

	Show First 20 Lines • Show All 1,854 Lines • ▼ Show 20 Lines
	; KNL-LABEL: test_build_vec_v32i1:			; KNL-LABEL: test_build_vec_v32i1:
	; KNL: ## %bb.0:			; KNL: ## %bb.0:
	; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0			; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
	; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1			; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
	; KNL-NEXT: retq			; KNL-NEXT: retq
	;			;
	; SKX-LABEL: test_build_vec_v32i1:			; SKX-LABEL: test_build_vec_v32i1:
	; SKX: ## %bb.0:			; SKX: ## %bb.0:
	; SKX-NEXT: movl $1497715861, %eax ## imm = 0x59455495			; SKX-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0
	; SKX-NEXT: kmovd %eax, %k1
	; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
	; SKX-NEXT: retq			; SKX-NEXT: retq
	;			;
	; AVX512BW-LABEL: test_build_vec_v32i1:			; AVX512BW-LABEL: test_build_vec_v32i1:
	; AVX512BW: ## %bb.0:			; AVX512BW: ## %bb.0:
	; AVX512BW-NEXT: movl $1497715861, %eax ## imm = 0x59455495			; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
	; AVX512BW-NEXT: kmovd %eax, %k1
	; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
	; AVX512BW-NEXT: retq			; AVX512BW-NEXT: retq
	;			;
	; AVX512DQ-LABEL: test_build_vec_v32i1:			; AVX512DQ-LABEL: test_build_vec_v32i1:
	; AVX512DQ: ## %bb.0:			; AVX512DQ: ## %bb.0:
	; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0			; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
	; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1			; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
	; AVX512DQ-NEXT: retq			; AVX512DQ-NEXT: retq
	;			;
	; X86-LABEL: test_build_vec_v32i1:			; X86-LABEL: test_build_vec_v32i1:
	; X86: ## %bb.0:			; X86: ## %bb.0:
	; X86-NEXT: movl $1497715861, %eax ## imm = 0x59455495			; X86-NEXT: vandps LCPI40_0, %zmm0, %zmm0
	; X86-NEXT: kmovd %eax, %k1
	; X86-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
	; X86-NEXT: retl			; X86-NEXT: retl
	%ret = select <32 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <32 x i16> %x, <32 x i16> zeroinitializer			%ret = select <32 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <32 x i16> %x, <32 x i16> zeroinitializer
	ret <32 x i16> %ret			ret <32 x i16> %ret
	}			}

	define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) {			define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) {
	; KNL-LABEL: test_build_vec_v64i1:			; KNL-LABEL: test_build_vec_v64i1:
	; KNL: ## %bb.0:			; KNL: ## %bb.0:
	▲ Show 20 Lines • Show All 2,363 Lines • Show Last 20 Lines

test/CodeGen/X86/merge-consecutive-loads-512.ll

Show First 20 Lines • Show All 122 Lines • ▼ Show 20 Lines	; X32-AVX512F-NEXT: retl
%res2 = insertelement <8 x double> %res1, double 0.0, i32 2		%res2 = insertelement <8 x double> %res1, double 0.0, i32 2
%res3 = insertelement <8 x double> %res2, double 0.0, i32 3		%res3 = insertelement <8 x double> %res2, double 0.0, i32 3
%res6 = insertelement <8 x double> %res3, double 0.0, i32 6		%res6 = insertelement <8 x double> %res3, double 0.0, i32 6
%res7 = insertelement <8 x double> %res6, double 0.0, i32 7		%res7 = insertelement <8 x double> %res6, double 0.0, i32 7
ret <8 x double> %res7		ret <8 x double> %res7
}		}

define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp {		define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp {
; AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:		; ALL-LABEL: merge_8f64_f64_1u3u5zu8:
; AVX512F: # %bb.0:		; ALL: # %bb.0:
; AVX512F-NEXT: movb $32, %al		; ALL-NEXT: vmovdqu64 8(%rdi), %zmm0
; AVX512F-NEXT: kmovw %eax, %k0		; ALL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: knotw %k0, %k1		; ALL-NEXT: retq
; AVX512F-NEXT: vmovupd 8(%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: merge_8f64_f64_1u3u5zu8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movb $32, %al
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: knotw %k0, %k1
; AVX512BW-NEXT: vmovupd 8(%rdi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;		;
; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:		; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
; X32-AVX512F: # %bb.0:		; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax		; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: movb $32, %cl		; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0
; X32-AVX512F-NEXT: kmovw %ecx, %k0		; X32-AVX512F-NEXT: vpandq {{\.LCPI.*}}, %zmm0, %zmm0
; X32-AVX512F-NEXT: knotw %k0, %k1
; X32-AVX512F-NEXT: vmovupd 8(%eax), %zmm0 {%k1} {z}
; X32-AVX512F-NEXT: retl		; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds double, double* %ptr, i64 1		%ptr0 = getelementptr inbounds double, double* %ptr, i64 1
%ptr2 = getelementptr inbounds double, double* %ptr, i64 3		%ptr2 = getelementptr inbounds double, double* %ptr, i64 3
%ptr4 = getelementptr inbounds double, double* %ptr, i64 5		%ptr4 = getelementptr inbounds double, double* %ptr, i64 5
%ptr7 = getelementptr inbounds double, double* %ptr, i64 8		%ptr7 = getelementptr inbounds double, double* %ptr, i64 8
%val0 = load double, double* %ptr0		%val0 = load double, double* %ptr0
%val2 = load double, double* %ptr2		%val2 = load double, double* %ptr2
%val4 = load double, double* %ptr4		%val4 = load double, double* %ptr4
▲ Show 20 Lines • Show All 52 Lines • ▼ Show 20 Lines	; X32-AVX512F-NEXT: retl
%res3 = insertelement <8 x i64> %res2, i64 0, i32 3		%res3 = insertelement <8 x i64> %res2, i64 0, i32 3
%res4 = insertelement <8 x i64> %res3, i64 %val4, i32 4		%res4 = insertelement <8 x i64> %res3, i64 %val4, i32 4
%res6 = insertelement <8 x i64> %res4, i64 0, i32 6		%res6 = insertelement <8 x i64> %res4, i64 0, i32 6
%res7 = insertelement <8 x i64> %res6, i64 0, i32 7		%res7 = insertelement <8 x i64> %res6, i64 0, i32 7
ret <8 x i64> %res7		ret <8 x i64> %res7
}		}

define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {		define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {
; AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:		; ALL-LABEL: merge_8i64_i64_1u3u5zu8:
; AVX512F: # %bb.0:		; ALL: # %bb.0:
; AVX512F-NEXT: movb $32, %al		; ALL-NEXT: vmovdqu64 8(%rdi), %zmm0
; AVX512F-NEXT: kmovw %eax, %k0		; ALL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: knotw %k0, %k1		; ALL-NEXT: retq
; AVX512F-NEXT: vmovdqu64 8(%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: merge_8i64_i64_1u3u5zu8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movb $32, %al
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: knotw %k0, %k1
; AVX512BW-NEXT: vmovdqu64 8(%rdi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;		;
; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:		; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
; X32-AVX512F: # %bb.0:		; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax		; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: movb $32, %cl		; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0
; X32-AVX512F-NEXT: kmovw %ecx, %k0		; X32-AVX512F-NEXT: vpandq {{\.LCPI.*}}, %zmm0, %zmm0
; X32-AVX512F-NEXT: knotw %k0, %k1
; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0 {%k1} {z}
; X32-AVX512F-NEXT: retl		; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1		%ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
%ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3		%ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3
%ptr4 = getelementptr inbounds i64, i64* %ptr, i64 5		%ptr4 = getelementptr inbounds i64, i64* %ptr, i64 5
%ptr7 = getelementptr inbounds i64, i64* %ptr, i64 8		%ptr7 = getelementptr inbounds i64, i64* %ptr, i64 8
%val0 = load i64, i64* %ptr0		%val0 = load i64, i64* %ptr0
%val2 = load i64, i64* %ptr2		%val2 = load i64, i64* %ptr2
%val4 = load i64, i64* %ptr4		%val4 = load i64, i64* %ptr4
▲ Show 20 Lines • Show All 192 Lines • ▼ Show 20 Lines	; X32-AVX512F-NEXT: retl
%res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3		%res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
%resC = insertelement <16 x i32> %res3, i32 %valC, i32 12		%resC = insertelement <16 x i32> %res3, i32 %valC, i32 12
%resE = insertelement <16 x i32> %resC, i32 %valE, i32 14		%resE = insertelement <16 x i32> %resC, i32 %valE, i32 14
%resF = insertelement <16 x i32> %resE, i32 %valF, i32 15		%resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
ret <16 x i32> %resF		ret <16 x i32> %resF
}		}

define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {		define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {
; AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:		; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
; AVX512F: # %bb.0:		; ALL: # %bb.0:
; AVX512F-NEXT: movw $8240, %ax # imm = 0x2030		; ALL-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512F-NEXT: kmovw %eax, %k0		; ALL-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: knotw %k0, %k1		; ALL-NEXT: retq
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movw $8240, %ax # imm = 0x2030
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: knotw %k0, %k1
; AVX512BW-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;		;
; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:		; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
; X32-AVX512F: # %bb.0:		; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax		; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: movw $8240, %cx # imm = 0x2030		; X32-AVX512F-NEXT: vmovdqu64 (%eax), %zmm0
; X32-AVX512F-NEXT: kmovw %ecx, %k0		; X32-AVX512F-NEXT: vpandd {{\.LCPI.*}}, %zmm0, %zmm0
; X32-AVX512F-NEXT: knotw %k0, %k1
; X32-AVX512F-NEXT: vmovdqu32 (%eax), %zmm0 {%k1} {z}
; X32-AVX512F-NEXT: retl		; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0		%ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
%ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3		%ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
%ptrC = getelementptr inbounds i32, i32* %ptr, i64 12		%ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
%ptrE = getelementptr inbounds i32, i32* %ptr, i64 14		%ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
%ptrF = getelementptr inbounds i32, i32* %ptr, i64 15		%ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
%val0 = load i32, i32* %ptr0		%val0 = load i32, i32* %ptr0
%val3 = load i32, i32* %ptr3		%val3 = load i32, i32* %ptr3
▲ Show 20 Lines • Show All 247 Lines • Show Last 20 Lines

test/CodeGen/X86/vector-shuffle-512-v32.ll

	Show First 20 Lines • Show All 197 Lines • ▼ Show 20 Lines
	; KNL-NEXT: movl $65535, %eax ## imm = 0xFFFF			; KNL-NEXT: movl $65535, %eax ## imm = 0xFFFF
	; KNL-NEXT: vmovd %eax, %xmm1			; KNL-NEXT: vmovd %eax, %xmm1
	; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0			; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
	; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1			; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
	; KNL-NEXT: retq			; KNL-NEXT: retq
	;			;
	; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:			; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
	; SKX: ## %bb.0:			; SKX: ## %bb.0:
	; SKX-NEXT: movl $1, %eax			; SKX-NEXT: movl $65535, %eax ## imm = 0xFFFF
	; SKX-NEXT: kmovd %eax, %k1			; SKX-NEXT: vmovd %eax, %xmm1
	; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}			; SKX-NEXT: vpandq %zmm1, %zmm0, %zmm0
	; SKX-NEXT: retq			; SKX-NEXT: retq
	%shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>			%shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
	ret <32 x i16> %shuffle			ret <32 x i16> %shuffle
	}			}

	define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) {			define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) {
	; KNL-LABEL: insert_dup_mem_v32i16_i32:			; KNL-LABEL: insert_dup_mem_v32i16_i32:
	; KNL: ## %bb.0:			; KNL: ## %bb.0:
	▲ Show 20 Lines • Show All 154 Lines • Show Last 20 Lines