This is an archive of the discontinued LLVM Phabricator instance.

[X86, AVX] instcombine vperm2 intrinsics with zero inputs into shuffles
ClosedPublic

Authored by spatel on Mar 23 2015, 2:49 PM.

Download Raw Diff

Details

Reviewers

RKSimon
chandlerc
andreadb
mkuper

Commits

rG43a87fdc7956: [X86, AVX] instcombine vperm2 intrinsics with zero inputs into shuffles
rL233110: [X86, AVX] instcombine vperm2 intrinsics with zero inputs into shuffles

Summary

This is the IR optimizer follow-on patch for D8563: the x86 backend patch that converts this kind of shuffle back into a vperm2.

This is also a continuation of the transform that started in D8486. In that patch, Andrea suggested that we could convert vperm2 intrinsics that use zero masks into a single shuffle. This is an implementation of that suggestion.

I recognize that we could go even further into bit twiddling hackery to make the code a line or two shorter, but I thought it would hurt readability.

Diff Detail

Repository: rL LLVM

Event Timeline

spatel updated this revision to Diff 22519. Mar 23 2015, 2:49 PM

spatel retitled this revision to [X86, AVX] instcombine vperm2 intrinsics with zero inputs into shuffles.

spatel updated this object.

spatel edited the test plan for this revision. (Show Details)

spatel added reviewers: andreadb, mkuper, RKSimon, chandlerc.

spatel added a subscriber: Unknown Object (MLST).

Patch updated based on email suggestion by Andrea (thanks!):
Rather than keeping the shuffle operand order fixed based on the inputs to the intrinsic, swap them as needed. This has 2 benefits:

1. It simplifies the zero vector replacement logic.
2. It creates a shuffle in a more canonical form; the x86 backend swaps shuffle operands to reduce accesses to the 2nd input in the low half of the result vector.

LGTM. Thanks Sanjay!

lib/Transforms/InstCombine/InstCombineCalls.cpp
222–230 ↗	(On Diff #22582)	You can move this logic after line 235.

This revision is now accepted and ready to land. Mar 24 2015, 10:19 AM

Closed by commit rL233110: [X86, AVX] instcombine vperm2 intrinsics with zero inputs into shuffles (authored by spatel). · Explain Why · Mar 24 2015, 1:39 PM

This revision was automatically updated to reflect the committed changes.

Thanks, Andrea!

For the record, I did a little Clang test case dance with r233109 and r233111
in an attempt to not cause any buildbot breakage with this commit (r233110).
I learned my lesson from the last two similar changes. :)

Is it wrong that a Clang regression test has a dependency on the IR optimizer?

Revision Contents

Path

Size

llvm/

trunk/

lib/

Transforms/

InstCombine/

InstCombineCalls.cpp

74 lines

test/

Transforms/

InstCombine/

x86-vperm2.ll

37 lines

Diff 22596

llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp

	Show First 20 Lines • Show All 198 Lines • ▼ Show 20 Lines

	/// The shuffle mask for a perm2*128 selects any two halves of two 256-bit			/// The shuffle mask for a perm2*128 selects any two halves of two 256-bit
	/// source vectors, unless a zero bit is set. If a zero bit is set,			/// source vectors, unless a zero bit is set. If a zero bit is set,
	/// then ignore that half of the mask and clear that half of the vector.			/// then ignore that half of the mask and clear that half of the vector.
	static Value *SimplifyX86vperm2(const IntrinsicInst &II,			static Value *SimplifyX86vperm2(const IntrinsicInst &II,
	InstCombiner::BuilderTy &Builder) {			InstCombiner::BuilderTy &Builder) {
	if (auto CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {			if (auto CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
	VectorType *VecTy = cast<VectorType>(II.getType());			VectorType *VecTy = cast<VectorType>(II.getType());
	uint8_t Imm = CInt->getZExtValue();			ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

	// The immediate permute control byte looks like this:			// The immediate permute control byte looks like this:
	// [1:0] - select 128 bits from sources for low half of destination			// [1:0] - select 128 bits from sources for low half of destination
	// [2] - ignore			// [2] - ignore
	// [3] - zero low half of destination			// [3] - zero low half of destination
	// [5:4] - select 128 bits from sources for high half of destination			// [5:4] - select 128 bits from sources for high half of destination
	// [6] - ignore			// [6] - ignore
	// [7] - zero high half of destination			// [7] - zero high half of destination

	if ((Imm & 0x88) == 0x88) {			uint8_t Imm = CInt->getZExtValue();

				bool LowHalfZero = Imm & 0x08;
				bool HighHalfZero = Imm & 0x80;

	// If both zero mask bits are set, this was just a weird way to			// If both zero mask bits are set, this was just a weird way to
	// generate a zero vector.			// generate a zero vector.
	return ConstantAggregateZero::get(VecTy);			if (LowHalfZero && HighHalfZero)
	}			return ZeroVector;

	// TODO: If a single zero bit is set, replace one of the source operands			// If 0 or 1 zero mask bits are set, this is a simple shuffle.
	// with a zero vector and use the same mask generation logic as below.

	if ((Imm & 0x88) == 0x00) {
	// If neither zero mask bit is set, this is a simple shuffle.
	unsigned NumElts = VecTy->getNumElements();			unsigned NumElts = VecTy->getNumElements();
	unsigned HalfSize = NumElts / 2;			unsigned HalfSize = NumElts / 2;
	unsigned HalfBegin;
	SmallVector<int, 8> ShuffleMask(NumElts);			SmallVector<int, 8> ShuffleMask(NumElts);

				// The high bit of the selection field chooses the 1st or 2nd operand.
				bool LowInputSelect = Imm & 0x02;
				bool HighInputSelect = Imm & 0x20;

				// The low bit of the selection field chooses the low or high half
				// of the selected operand.
				bool LowHalfSelect = Imm & 0x01;
				bool HighHalfSelect = Imm & 0x10;

				// Determine which operand(s) are actually in use for this instruction.
				Value *V0 = LowInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);
				Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);

				// If needed, replace operands based on zero mask.
				V0 = LowHalfZero ? ZeroVector : V0;
				V1 = HighHalfZero ? ZeroVector : V1;

	// Permute low half of result.			// Permute low half of result.
	HalfBegin = (Imm & 0x3) * HalfSize;			unsigned StartIndex = LowHalfSelect ? HalfSize : 0;
	for (unsigned i = 0; i != HalfSize; ++i)			for (unsigned i = 0; i < HalfSize; ++i)
	ShuffleMask[i] = HalfBegin + i;			ShuffleMask[i] = StartIndex + i;

	// Permute high half of result.			// Permute high half of result.
	HalfBegin = ((Imm >> 4) & 0x3) * HalfSize;			StartIndex = HighHalfSelect ? HalfSize : 0;
	for (unsigned i = HalfSize; i != NumElts; ++i)			StartIndex += NumElts;
	ShuffleMask[i] = HalfBegin + i - HalfSize;			for (unsigned i = 0; i < HalfSize; ++i)
				ShuffleMask[i + HalfSize] = StartIndex + i;
	Value *Op0 = II.getArgOperand(0);
	Value *Op1 = II.getArgOperand(1);			return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
	return Builder.CreateShuffleVector(Op0, Op1, ShuffleMask);
	}
	}			}
	return nullptr;			return nullptr;
	}			}

	/// visitCallInst - CallInst simplification. This mostly only handles folding			/// visitCallInst - CallInst simplification. This mostly only handles folding
	/// of intrinsic instructions. For normal calls, it allows visitCallSite to do			/// of intrinsic instructions. For normal calls, it allows visitCallSite to do
	/// the heavy lifting.			/// the heavy lifting.
	///			///
	▲ Show 20 Lines • Show All 1,662 Lines • Show Last 20 Lines

llvm/trunk/test/Transforms/InstCombine/x86-vperm2.ll

Show First 20 Lines • Show All 70 Lines • ▼ Show 20 Lines
; CHECK-NEXT: ret <4 x double> %1		; CHECK-NEXT: ret <4 x double> %1
}		}

define <4 x double> @perm2pd_0x02(<4 x double> %a0, <4 x double> %a1) {		define <4 x double> @perm2pd_0x02(<4 x double> %a0, <4 x double> %a1) {
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 2)		%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 2)
ret <4 x double> %res		ret <4 x double> %res

; CHECK-LABEL: @perm2pd_0x02		; CHECK-LABEL: @perm2pd_0x02
; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>		; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: ret <4 x double> %1		; CHECK-NEXT: ret <4 x double> %1
}		}

define <4 x double> @perm2pd_0x03(<4 x double> %a0, <4 x double> %a1) {		define <4 x double> @perm2pd_0x03(<4 x double> %a0, <4 x double> %a1) {
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 3)		%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 3)
ret <4 x double> %res		ret <4 x double> %res

; CHECK-LABEL: @perm2pd_0x03		; CHECK-LABEL: @perm2pd_0x03
; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>		; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; CHECK-NEXT: ret <4 x double> %1		; CHECK-NEXT: ret <4 x double> %1
}		}

define <4 x double> @perm2pd_0x10(<4 x double> %a0, <4 x double> %a1) {		define <4 x double> @perm2pd_0x10(<4 x double> %a0, <4 x double> %a1) {
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 16)		%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 16)
ret <4 x double> %res		ret <4 x double> %res

; CHECK-LABEL: @perm2pd_0x10		; CHECK-LABEL: @perm2pd_0x10
Show All 9 Lines
; CHECK-NEXT: ret <4 x double> %1		; CHECK-NEXT: ret <4 x double> %1
}		}

define <4 x double> @perm2pd_0x12(<4 x double> %a0, <4 x double> %a1) {		define <4 x double> @perm2pd_0x12(<4 x double> %a0, <4 x double> %a1) {
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 18)		%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 18)
ret <4 x double> %res		ret <4 x double> %res

; CHECK-LABEL: @perm2pd_0x12		; CHECK-LABEL: @perm2pd_0x12
; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>		; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
; CHECK-NEXT: ret <4 x double> %1		; CHECK-NEXT: ret <4 x double> %1
}		}

define <4 x double> @perm2pd_0x13(<4 x double> %a0, <4 x double> %a1) {		define <4 x double> @perm2pd_0x13(<4 x double> %a0, <4 x double> %a1) {
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 19)		%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 19)
ret <4 x double> %res		ret <4 x double> %res

; CHECK-LABEL: @perm2pd_0x13		; CHECK-LABEL: @perm2pd_0x13
; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>		; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
; CHECK-NEXT: ret <4 x double> %1		; CHECK-NEXT: ret <4 x double> %1
}		}

define <4 x double> @perm2pd_0x20(<4 x double> %a0, <4 x double> %a1) {		define <4 x double> @perm2pd_0x20(<4 x double> %a0, <4 x double> %a1) {
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 32)		%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 32)
ret <4 x double> %res		ret <4 x double> %res

; CHECK-LABEL: @perm2pd_0x20		; CHECK-LABEL: @perm2pd_0x20
▲ Show 20 Lines • Show All 70 Lines • ▼ Show 20 Lines	define <8 x float> @perm2ps_0x31(<8 x float> %a0, <8 x float> %a1) {
ret <8 x float> %res		ret <8 x float> %res

; CHECK-LABEL: @perm2ps_0x31		; CHECK-LABEL: @perm2ps_0x31
; CHECK-NEXT: %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>		; CHECK-NEXT: %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: ret <8 x float> %1		; CHECK-NEXT: ret <8 x float> %1
}		}


; Confirm that when a single zero mask bit is set, we do nothing.		; Confirm that when a single zero mask bit is set, we replace a source vector with zeros.

		define <4 x double> @perm2pd_0x81(<4 x double> %a0, <4 x double> %a1) {
		%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 129)
		ret <4 x double> %res

		; CHECK-LABEL: @perm2pd_0x81
		; CHECK-NEXT: shufflevector <4 x double> %a0, <4 x double> <double 0.0{{.*}}<4 x i32> <i32 2, i32 3, i32 4, i32 5>
		; CHECK-NEXT: ret <4 x double>
		}

define <4 x double> @perm2pd_0x83(<4 x double> %a0, <4 x double> %a1) {		define <4 x double> @perm2pd_0x83(<4 x double> %a0, <4 x double> %a1) {
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 131)		%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 131)
ret <4 x double> %res		ret <4 x double> %res

; CHECK-LABEL: @perm2pd_0x83		; CHECK-LABEL: @perm2pd_0x83
; CHECK-NEXT: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 -125)		; CHECK-NEXT: shufflevector <4 x double> %a1, <4 x double> <double 0.0{{.*}}, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; CHECK-NEXT: ret <4 x double>		; CHECK-NEXT: ret <4 x double>
}		}

		define <4 x double> @perm2pd_0x28(<4 x double> %a0, <4 x double> %a1) {
		%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 40)
		ret <4 x double> %res

; Confirm that when the other zero mask bit is set, we do nothing. Also confirm that an ignored bit has no effect.		; CHECK-LABEL: @perm2pd_0x28
		; CHECK-NEXT: shufflevector <4 x double> <double 0.0{{.*}}, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
		; CHECK-NEXT: ret <4 x double>
		}

define <4 x double> @perm2pd_0x48(<4 x double> %a0, <4 x double> %a1) {		define <4 x double> @perm2pd_0x08(<4 x double> %a0, <4 x double> %a1) {
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 72)		%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 8)
ret <4 x double> %res		ret <4 x double> %res

; CHECK-LABEL: @perm2pd_0x48		; CHECK-LABEL: @perm2pd_0x08
; CHECK-NEXT: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 72)		; CHECK-NEXT: shufflevector <4 x double> <double 0.0{{.*}}, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: ret <4 x double>		; CHECK-NEXT: ret <4 x double>
}		}

declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone		declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone		declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone		declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone