Diff 364092

llvm/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 35,773 Lines • ▼ Show 20 Lines	static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// here, we're not going to remove the operands we find.		// here, we're not going to remove the operands we find.
bool UnaryShuffle = (Inputs.size() == 1);		bool UnaryShuffle = (Inputs.size() == 1);
SDValue V1 = peekThroughBitcasts(Inputs[0]);		SDValue V1 = peekThroughBitcasts(Inputs[0]);
SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())		SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
: peekThroughBitcasts(Inputs[1]));		: peekThroughBitcasts(Inputs[1]));

MVT VT1 = V1.getSimpleValueType();		MVT VT1 = V1.getSimpleValueType();
MVT VT2 = V2.getSimpleValueType();		MVT VT2 = V2.getSimpleValueType();
assert(VT1.getSizeInBits() == RootSizeInBits &&		assert(VT1.getSizeInBits() == RootSizeInBits &&
VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch");		VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch");
		lebedev.riAuthorUnsubmitted Done Reply Inline Actions Didn't we just assert that the inputs have the same size as root? lebedev.ri: Didn't we just assert that the inputs have the same size as root?
		RKSimonUnsubmitted Done Reply Inline Actions The middle-term plan is to relax these to (RootSizeInBits % VT1.getSizeInBits()) == 0 - we're getting pretty close and that will let us stop widening in combineX86ShufflesRecursively - which is causing hasOneUse() problems - and just widen when a match is found (which is what CanonicalizeShuffleInput will do). This should also help us get rid of combineX86ShuffleChainWithExtract. But I can understand if you don't want to handle this yet. RKSimon: The middle-term plan is to relax these to (RootSizeInBits % VT1.getSizeInBits()) == 0 - we're…
		lebedev.riAuthorUnsubmitted Done Reply Inline Actions Right. I'm aware that this is an artificial (and subpar) restriction. lebedev.ri: Right. I'm aware that this is an artificial (and subpar) restriction.

SDLoc DL(Root);		SDLoc DL(Root);
SDValue Res;		SDValue Res;

unsigned NumBaseMaskElts = BaseMask.size();		unsigned NumBaseMaskElts = BaseMask.size();
if (NumBaseMaskElts == 1) {		if (NumBaseMaskElts == 1) {
assert(BaseMask[0] == 0 && "Invalid shuffle index found!");		assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
return CanonicalizeShuffleInput(RootVT, V1);		return CanonicalizeShuffleInput(RootVT, V1);
}		}

bool OptForSize = DAG.shouldOptForSize();		bool OptForSize = DAG.shouldOptForSize();
unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;		unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
bool FloatDomain = VT1.isFloatingPoint() \|\| VT2.isFloatingPoint() \|\|		bool FloatDomain = VT1.isFloatingPoint() \|\| VT2.isFloatingPoint() \|\|
(RootVT.isFloatingPoint() && Depth >= 1) \|\|		(RootVT.isFloatingPoint() && Depth >= 1) \|\|
(RootVT.is256BitVector() && !Subtarget.hasAVX2());		(RootVT.is256BitVector() && !Subtarget.hasAVX2());

		// How many elements does each of the inputs have, given the current
		RKSimonUnsubmitted Done Reply Inline Actions Add brief description comment RKSimon: Add brief description comment
		// granularity of the root shuffle? Note that while currently the sizes of the
		// inputs must match the size of the shuffle root,
		// that restriction will be lifted in the future.
		SmallVector<unsigned, 2> InputNumElts;
		llvm::transform(std::initializer_list<MVT>({VT1, VT2}),
		std::back_inserter(InputNumElts),
		[BaseMaskEltSizeInBits](MVT VT) {
		assert(VT.getSizeInBits() % BaseMaskEltSizeInBits == 0 &&
		"Input is not a multiple of output element width?");
		return VT.getSizeInBits() / BaseMaskEltSizeInBits;
		});

// Don't combine if we are an AVX512/EVEX target and the mask element size		// Don't combine if we are an AVX512/EVEX target and the mask element size
// is different from the root element size - this would prevent writemasks		// is different from the root element size - this would prevent writemasks
// from being reused.		// from being reused.
bool IsMaskedShuffle = false;		bool IsMaskedShuffle = false;
if (RootSizeInBits == 512 \|\| (Subtarget.hasVLX() && RootSizeInBits >= 128)) {		if (RootSizeInBits == 512 \|\| (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&		if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {		Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
IsMaskedShuffle = true;		IsMaskedShuffle = true;
}		}
}		}

// If we are shuffling a broadcast (and not introducing zeros) then		// If we are shuffling a broadcast (and not introducing zeros) then
// we can just use the broadcast directly. This works for smaller broadcast		// we can just use the broadcast directly. This works for smaller broadcast
// elements as well as they already repeat across each mask element		// elements as well as they already repeat across each mask element
if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&		SmallVector<bool, 2> InputIsSplat;
(BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&		llvm::transform(
		std::initializer_list<SDValue>({V1, V2}),
		std::back_inserter(InputIsSplat), [BaseMaskEltSizeInBits](SDValue V) {
		return isTargetShuffleSplat(V) &&
		(BaseMaskEltSizeInBits % V.getScalarValueSizeInBits()) == 0;
		});
		if (UnaryShuffle && InputIsSplat[0] && !isAnyZero(BaseMask) &&
		RKSimonUnsubmitted Not Done Reply Inline Actions Is this any better? ''' bool IsSplat0 = isTargetShuffleSplat(V0); bool IsSplat1 = isTargetShuffleSplat(V1); IsSplat0 &= !!(BaseMaskEltSizeInBits % V0.getScalarValueSizeInBits()); IsSplat1 &= !!(BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()); ''' RKSimon: Is this any better? ''' bool IsSplat0 = isTargetShuffleSplat(V0); bool IsSplat1 =…
V1.getValueSizeInBits() >= RootSizeInBits) {		V1.getValueSizeInBits() >= RootSizeInBits) {
return CanonicalizeShuffleInput(RootVT, V1);		return CanonicalizeShuffleInput(RootVT, V1);
}		}

		// Adjust mask elements that pick from a splat input to be identity mask elts,
		// i.e. to pick from the same lane of the input as the mask element is in.
		// This may allow the shuffle to be simplified into a blend.
		SmallVector<int> NewMask;
		RKSimonUnsubmitted Done Reply Inline Actions Move this inside the if() ? RKSimon: Move this inside the if() ?
		lebedev.riAuthorUnsubmitted Done Reply Inline Actions This can't work, because `BaseMask` is an `ArrayRef`, so if we narrow the scope of `NewMask`, then we can't assign it to `BaseMask`, because that would result in use-after-scope. Later in the function we have `SmallVector<int, 64> Mask;` with the same purpose, so the solution is to change the code in between to use `NewMask` instead of `BaseMask`, but that is a separate change that I will do afterwards. lebedev.ri: This can't work, because `BaseMask` is an `ArrayRef`, so if we narrow the scope of `NewMask`…
		RKSimonUnsubmitted Done Reply Inline Actions OK - got it - cheers RKSimon: OK - got it - cheers
		lebedev.riAuthorUnsubmitted Done Reply Inline Actions Done in rG35c0848b570214ed2b2d96cca4dd62bb7ae725cd. lebedev.ri: Done in rG35c0848b570214ed2b2d96cca4dd62bb7ae725cd.
		if (InputIsSplat[0] \|\| InputIsSplat[1]) {
		RKSimonUnsubmitted Done Reply Inline Actions Both V1 and V2 could be smaller than RootSizeInBits - we need to either not try to do this in that case or bail in the loop below if we try to reference an 'identity' element higher than V1 or V2's width. RKSimon: Both V1 and V2 could be smaller than RootSizeInBits - we need to either not try to do this in…
		lebedev.riAuthorUnsubmitted Done Reply Inline Actions Okay, does this look about right? lebedev.ri: Okay, does this look about right?
		NewMask.assign(BaseMask.begin(), BaseMask.end());
		RKSimonUnsubmitted Done Reply Inline Actions Why not NewMask.assign(BaseMask.begin(), BaseMask.end()) - then we just need to adjust the masks instead of repeated emplace_back(). RKSimon: Why not NewMask.assign(BaseMask.begin(), BaseMask.end()) - then we just need to adjust the…
		for (unsigned i = 0; i != NumBaseMaskElts; ++i) {
		int &M = NewMask[i];
		assert(isUndefOrZeroOrInRange(M, 0, 2 * NumBaseMaskElts) &&
		RKSimonUnsubmitted Done Reply Inline Actions Use isUndefOrInRange ? RKSimon: Use isUndefOrInRange ?
		"OOB mask element?");
		if (M < 0)
		continue; // Keep the undef/zero mask elements as-is.
		int InputIdx = (unsigned)M < NumBaseMaskElts ? 0 : 1;
		// Is the used input wide-enough to contain that lane, and is it a splat?
		if (InputIsSplat[InputIdx] && i < InputNumElts[InputIdx])
		M = i + InputIdx * NumBaseMaskElts; // Pick from the same lane of input.
		}
		BaseMask = std::move(NewMask);
		}

// See if the shuffle is a hidden identity shuffle - repeated args in HOPs		// See if the shuffle is a hidden identity shuffle - repeated args in HOPs
// etc. can be simplified.		// etc. can be simplified.
		RKSimonUnsubmitted Done Reply Inline Actions This looks over-complicated? Merge some of the comments and avoid nested if()s RKSimon: This looks over-complicated? Merge some of the comments and avoid nested if()s
if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {		if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
SmallVector<int> ScaledMask, IdentityMask;		SmallVector<int> ScaledMask, IdentityMask;
unsigned NumElts = VT1.getVectorNumElements();		unsigned NumElts = VT1.getVectorNumElements();
if (BaseMask.size() <= NumElts &&		if (BaseMask.size() <= NumElts &&
		RKSimonUnsubmitted Done Reply Inline Actions BaseMask = std::move(NewMask); RKSimon: BaseMask = std::move(NewMask);
scaleShuffleElements(BaseMask, NumElts, ScaledMask)) {		scaleShuffleElements(BaseMask, NumElts, ScaledMask)) {
for (unsigned i = 0; i != NumElts; ++i)		for (unsigned i = 0; i != NumElts; ++i)
IdentityMask.push_back(i);		IdentityMask.push_back(i);
if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))		if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
return CanonicalizeShuffleInput(RootVT, V1);		return CanonicalizeShuffleInput(RootVT, V1);
}		}
}		}

▲ Show 20 Lines • Show All 16,803 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/avx.ll

	Show First 20 Lines • Show All 147 Lines • ▼ Show 20 Lines
	;; FIXME: We're emitting an extraneous pshufd/vbroadcast.			;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
	define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {			define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
	; On X32, account for the arguments' move to registers			; On X32, account for the arguments' move to registers
	; X32-LABEL: insertps_from_broadcast_multiple_use:			; X32-LABEL: insertps_from_broadcast_multiple_use:
	; X32: ## %bb.0:			; X32: ## %bb.0:
	; X32-NEXT: movl {{[0-9]+}}(%esp), %eax			; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx			; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X32-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4			; X32-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4
	; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]			; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
	; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]			; X32-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
	; X32-NEXT: vaddps %xmm1, %xmm0, %xmm0			; X32-NEXT: vaddps %xmm1, %xmm0, %xmm0
	; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]			; X32-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
	; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]			; X32-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
	; X32-NEXT: vaddps %xmm2, %xmm1, %xmm1			; X32-NEXT: vaddps %xmm2, %xmm1, %xmm1
	; X32-NEXT: vaddps %xmm1, %xmm0, %xmm0			; X32-NEXT: vaddps %xmm1, %xmm0, %xmm0
	; X32-NEXT: retl			; X32-NEXT: retl
	;			;
	; X64-LABEL: insertps_from_broadcast_multiple_use:			; X64-LABEL: insertps_from_broadcast_multiple_use:
	; X64: ## %bb.0:			; X64: ## %bb.0:
	; X64-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4			; X64-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4
	; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]			; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
	; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]			; X64-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
	; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0			; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
	; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]			; X64-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
	; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]			; X64-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
	; X64-NEXT: vaddps %xmm2, %xmm1, %xmm1			; X64-NEXT: vaddps %xmm2, %xmm1, %xmm1
	; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0			; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%1 = getelementptr inbounds float, float* %fb, i64 %index			%1 = getelementptr inbounds float, float* %fb, i64 %index
	%2 = load float, float* %1, align 4			%2 = load float, float* %1, align 4
	%3 = insertelement <4 x float> undef, float %2, i32 0			%3 = insertelement <4 x float> undef, float %2, i32 0
	%4 = insertelement <4 x float> %3, float %2, i32 1			%4 = insertelement <4 x float> %3, float %2, i32 1
	%5 = insertelement <4 x float> %4, float %2, i32 2			%5 = insertelement <4 x float> %4, float %2, i32 2
	Show All 10 Lines

llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll

Show First 20 Lines • Show All 4,309 Lines • ▼ Show 20 Lines	; CHECK-NEXT: retq
%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer		%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
ret <4 x double> %res		ret <4 x double> %res
}		}

define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {		define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:		; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
; CHECK-FAST: # %bb.0:		; CHECK-FAST: # %bb.0:
; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2		; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,4,2,6]		; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,5,2,7]
; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3		; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3
; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2		; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1		; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1}		; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1}
; CHECK-FAST-NEXT: retq		; CHECK-FAST-NEXT: retq
;		;
; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:		; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
; CHECK-FAST-PERLANE: # %bb.0:		; CHECK-FAST-PERLANE: # %bb.0:
; CHECK-FAST-PERLANE-NEXT: vpermpd $236, (%rdi), %ymm2 # ymm2 = mem[0,3,2,3]		; CHECK-FAST-PERLANE-NEXT: vpermpd $236, (%rdi), %ymm2 # ymm2 = mem[0,3,2,3]
; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3		; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1		; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 32(%rdi){1to4}, %ymm2, %ymm0 {%k1}		; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 32(%rdi){1to4}, %ymm2, %ymm0 {%k1}
; CHECK-FAST-PERLANE-NEXT: retq		; CHECK-FAST-PERLANE-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp		%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>		%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer		%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2		%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
ret <4 x double> %res		ret <4 x double> %res
}		}

define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %mask) {		define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %mask) {
; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:		; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
; CHECK-FAST: # %bb.0:		; CHECK-FAST: # %bb.0:
; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2		; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [3,4,2,6]		; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [3,5,2,7]
; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3		; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1		; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}		; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}
; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0		; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
; CHECK-FAST-NEXT: retq		; CHECK-FAST-NEXT: retq
;		;
; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:		; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
; CHECK-FAST-PERLANE: # %bb.0:		; CHECK-FAST-PERLANE: # %bb.0:
▲ Show 20 Lines • Show All 359 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/pr15296.ll

Show All 20 Lines	allocas:
%smear.7 = insertelement <8 x i32> %smear.6, i32 %shiftval, i32 7		%smear.7 = insertelement <8 x i32> %smear.6, i32 %shiftval, i32 7
%bitop = lshr <8 x i32> %input, %smear.7		%bitop = lshr <8 x i32> %input, %smear.7
ret <8 x i32> %bitop		ret <8 x i32> %bitop
}		}

define <8 x i32> @shiftInput___canonical(<8 x i32> %input, i32 %shiftval, <8 x i32> %__mask) nounwind {		define <8 x i32> @shiftInput___canonical(<8 x i32> %input, i32 %shiftval, <8 x i32> %__mask) nounwind {
; CHECK-LABEL: shiftInput___canonical:		; CHECK-LABEL: shiftInput___canonical:
; CHECK: # %bb.0: # %allocas		; CHECK: # %bb.0: # %allocas
; CHECK-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm1		; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero		; CHECK-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3		; CHECK-NEXT: vpsrld %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpsrld %xmm2, %xmm3, %xmm4		; CHECK-NEXT: vpsrld %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vpsrlq $32, %xmm1, %xmm5		; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: vpsrld %xmm5, %xmm3, %xmm6
; CHECK-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
; CHECK-NEXT: vpxor %xmm6, %xmm6, %xmm6
; CHECK-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm6[2,3,4,5,6,7]
; CHECK-NEXT: vpsrld %xmm6, %xmm3, %xmm7
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; CHECK-NEXT: vpsrld %xmm1, %xmm3, %xmm3
; CHECK-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7]
; CHECK-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
; CHECK-NEXT: vpsrld %xmm2, %xmm0, %xmm2
; CHECK-NEXT: vpsrld %xmm5, %xmm0, %xmm4
; CHECK-NEXT: vpsrld %xmm6, %xmm0, %xmm5
; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; CHECK-NEXT: retl		; CHECK-NEXT: retl
allocas:		allocas:
%smear.0 = insertelement <8 x i32> undef, i32 %shiftval, i32 0		%smear.0 = insertelement <8 x i32> undef, i32 %shiftval, i32 0
%smear.7 = shufflevector <8 x i32> %smear.0, <8 x i32> undef, <8 x i32> zeroinitializer		%smear.7 = shufflevector <8 x i32> %smear.0, <8 x i32> undef, <8 x i32> zeroinitializer
%bitop = lshr <8 x i32> %input, %smear.7		%bitop = lshr <8 x i32> %input, %smear.7
ret <8 x i32> %bitop		ret <8 x i32> %bitop
}		}

Show All 15 Lines

llvm/test/CodeGen/X86/sse41.ll

	Show First 20 Lines • Show All 1,655 Lines • ▼ Show 20 Lines
	; X86-SSE-NEXT: addps %xmm3, %xmm0 ## encoding: [0x0f,0x58,0xc3]			; X86-SSE-NEXT: addps %xmm3, %xmm0 ## encoding: [0x0f,0x58,0xc3]
	; X86-SSE-NEXT: retl ## encoding: [0xc3]			; X86-SSE-NEXT: retl ## encoding: [0xc3]
	;			;
	; X86-AVX1-LABEL: insertps_from_broadcast_multiple_use:			; X86-AVX1-LABEL: insertps_from_broadcast_multiple_use:
	; X86-AVX1: ## %bb.0:			; X86-AVX1: ## %bb.0:
	; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]			; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
	; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]			; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
	; X86-AVX1-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]			; X86-AVX1-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
	; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]			; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
	; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]			; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3]
	; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]			; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
	; X86-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0]			; X86-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3]
	; X86-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]			; X86-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
	; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]			; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
	; X86-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0]			; X86-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[3]
	; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]			; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
	; X86-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0]			; X86-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[3]
	; X86-AVX1-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]			; X86-AVX1-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
	; X86-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]			; X86-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
	; X86-AVX1-NEXT: retl ## encoding: [0xc3]			; X86-AVX1-NEXT: retl ## encoding: [0xc3]
	;			;
	; X86-AVX512-LABEL: insertps_from_broadcast_multiple_use:			; X86-AVX512-LABEL: insertps_from_broadcast_multiple_use:
	; X86-AVX512: ## %bb.0:			; X86-AVX512: ## %bb.0:
	; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]			; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
	; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]			; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
	; X86-AVX512-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]			; X86-AVX512-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
	; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]			; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
	; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]			; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3]
	; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]			; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
	; X86-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0]			; X86-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3]
				; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
				; X86-AVX512-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[3]
				; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
				; X86-AVX512-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[3]
	; X86-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]			; X86-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
	; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]			; X86-AVX512-NEXT: vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
	; X86-AVX512-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0]
	; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
	; X86-AVX512-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0]
	; X86-AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
	; X86-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]			; X86-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
	; X86-AVX512-NEXT: retl ## encoding: [0xc3]			; X86-AVX512-NEXT: retl ## encoding: [0xc3]
	;			;
	; X64-SSE-LABEL: insertps_from_broadcast_multiple_use:			; X64-SSE-LABEL: insertps_from_broadcast_multiple_use:
	; X64-SSE: ## %bb.0:			; X64-SSE: ## %bb.0:
	; X64-SSE-NEXT: movss (%rdi,%rsi,4), %xmm4 ## encoding: [0xf3,0x0f,0x10,0x24,0xb7]			; X64-SSE-NEXT: movss (%rdi,%rsi,4), %xmm4 ## encoding: [0xf3,0x0f,0x10,0x24,0xb7]
	; X64-SSE-NEXT: ## xmm4 = mem[0],zero,zero,zero			; X64-SSE-NEXT: ## xmm4 = mem[0],zero,zero,zero
	; X64-SSE-NEXT: insertps $48, %xmm4, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc4,0x30]			; X64-SSE-NEXT: insertps $48, %xmm4, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc4,0x30]
	; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]			; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]
	; X64-SSE-NEXT: insertps $48, %xmm4, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xcc,0x30]			; X64-SSE-NEXT: insertps $48, %xmm4, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xcc,0x30]
	; X64-SSE-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0]			; X64-SSE-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0]
	; X64-SSE-NEXT: addps %xmm1, %xmm0 ## encoding: [0x0f,0x58,0xc1]			; X64-SSE-NEXT: addps %xmm1, %xmm0 ## encoding: [0x0f,0x58,0xc1]
	; X64-SSE-NEXT: insertps $48, %xmm4, %xmm2 ## encoding: [0x66,0x0f,0x3a,0x21,0xd4,0x30]			; X64-SSE-NEXT: insertps $48, %xmm4, %xmm2 ## encoding: [0x66,0x0f,0x3a,0x21,0xd4,0x30]
	; X64-SSE-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[0]			; X64-SSE-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[0]
	; X64-SSE-NEXT: insertps $48, %xmm4, %xmm3 ## encoding: [0x66,0x0f,0x3a,0x21,0xdc,0x30]			; X64-SSE-NEXT: insertps $48, %xmm4, %xmm3 ## encoding: [0x66,0x0f,0x3a,0x21,0xdc,0x30]
	; X64-SSE-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[0]			; X64-SSE-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[0]
	; X64-SSE-NEXT: addps %xmm2, %xmm3 ## encoding: [0x0f,0x58,0xda]			; X64-SSE-NEXT: addps %xmm2, %xmm3 ## encoding: [0x0f,0x58,0xda]
	; X64-SSE-NEXT: addps %xmm3, %xmm0 ## encoding: [0x0f,0x58,0xc3]			; X64-SSE-NEXT: addps %xmm3, %xmm0 ## encoding: [0x0f,0x58,0xc3]
	; X64-SSE-NEXT: retq ## encoding: [0xc3]			; X64-SSE-NEXT: retq ## encoding: [0xc3]
	;			;
	; X64-AVX1-LABEL: insertps_from_broadcast_multiple_use:			; X64-AVX1-LABEL: insertps_from_broadcast_multiple_use:
	; X64-AVX1: ## %bb.0:			; X64-AVX1: ## %bb.0:
	; X64-AVX1-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]			; X64-AVX1-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
	; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]			; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
	; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]			; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3]
	; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]			; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
	; X64-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0]			; X64-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3]
	; X64-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]			; X64-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
	; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]			; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
	; X64-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0]			; X64-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[3]
	; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]			; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
	; X64-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0]			; X64-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[3]
	; X64-AVX1-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]			; X64-AVX1-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
	; X64-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]			; X64-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
	; X64-AVX1-NEXT: retq ## encoding: [0xc3]			; X64-AVX1-NEXT: retq ## encoding: [0xc3]
	;			;
	; X64-AVX512-LABEL: insertps_from_broadcast_multiple_use:			; X64-AVX512-LABEL: insertps_from_broadcast_multiple_use:
	; X64-AVX512: ## %bb.0:			; X64-AVX512: ## %bb.0:
	; X64-AVX512-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]			; X64-AVX512-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
	; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]			; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
	; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]			; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3]
	; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]			; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
	; X64-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0]			; X64-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3]
				; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
				; X64-AVX512-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[3]
				; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
				; X64-AVX512-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[3]
	; X64-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]			; X64-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
	; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]			; X64-AVX512-NEXT: vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
	; X64-AVX512-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0]
	; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
	; X64-AVX512-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0]
	; X64-AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
	; X64-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]			; X64-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
	; X64-AVX512-NEXT: retq ## encoding: [0xc3]			; X64-AVX512-NEXT: retq ## encoding: [0xc3]
	%1 = getelementptr inbounds float, float* %fb, i64 %index			%1 = getelementptr inbounds float, float* %fb, i64 %index
	%2 = load float, float* %1, align 4			%2 = load float, float* %1, align 4
	%3 = insertelement <4 x float> undef, float %2, i32 0			%3 = insertelement <4 x float> undef, float %2, i32 0
	%4 = insertelement <4 x float> %3, float %2, i32 1			%4 = insertelement <4 x float> %3, float %2, i32 1
	%5 = insertelement <4 x float> %4, float %2, i32 2			%5 = insertelement <4 x float> %4, float %2, i32 2
	%6 = insertelement <4 x float> %5, float %2, i32 3			%6 = insertelement <4 x float> %5, float %2, i32 3
	▲ Show 20 Lines • Show All 424 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 4,585 Lines • ▼ Show 20 Lines
	; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]			; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
	; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]			; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:			; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1			; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
	; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]			; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]
	; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]			; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	;			;
	; AVX512VLBW-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:			; AVX512VLBW-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
	; AVX512VLBW: # %bb.0:			; AVX512VLBW: # %bb.0:
	; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1			; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
	; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]			; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]
	; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]			; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
	; AVX512VLBW-NEXT: retq			; AVX512VLBW-NEXT: retq
	;			;
	; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:			; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
	; AVX512VLVBMI: # %bb.0:			; AVX512VLVBMI: # %bb.0:
	; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16]			; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16]
	; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0			; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
	; AVX512VLVBMI-NEXT: retq			; AVX512VLVBMI-NEXT: retq
	;			;
	▲ Show 20 Lines • Show All 671 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[WIP][X86] combineX86ShuffleChain(): canonicalize mask elts picking from splats
AbandonedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 364092

llvm/lib/Target/X86/X86ISelLowering.cpp

llvm/test/CodeGen/X86/avx.ll

llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll

llvm/test/CodeGen/X86/pr15296.ll

llvm/test/CodeGen/X86/sse41.ll

llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll

This is an archive of the discontinued LLVM Phabricator instance.

[WIP][X86] combineX86ShuffleChain(): canonicalize mask elts picking from splats
AbandonedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 364092

llvm/lib/Target/X86/X86ISelLowering.cpp

llvm/test/CodeGen/X86/avx.ll

llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll

llvm/test/CodeGen/X86/pr15296.ll

llvm/test/CodeGen/X86/sse41.ll

llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll

[WIP][X86] combineX86ShuffleChain(): canonicalize mask elts picking from splats
AbandonedPublic