This is an archive of the discontinued LLVM Phabricator instance.

Convert some X86 blendv* intrinsics into IR.
ClosedPublic

Authored by filcab on May 21 2014, 12:19 AM.

Download Raw Diff

Details

Reviewers

grosbach
nadav
delena

Group Reviewers

deleted

Commits

rG82ac07c28332: Convert some X86 blendv* intrinsics into IR.
rL209643: Convert some X86 blendv* intrinsics into IR.

Summary

Implemented an InstCombine transformation that takes a blendv* intrinsic
call and translates it into an IR select, if the mask is constant.

This will eventually get lowered into blends with immediates if possible,
or pblendvb (with an option to further optimize if we can transform the
pblendvb into a blend+immediate instruction, depending on the selector).
It will also enable optimizations by the IR passes, which give up on
sight of the intrinsic.

Both the transformation and the lowering of its result to asm got shiny
new tests.

The transformation is a bit convoluted because of blendvp[sd]'s
definition:

Its mask is a floating point value! This forces us to convert it and get
the highest bit. I suppose this happened because the mask has type
__m128 in Intel's intrinsic and v4sf (for blendvps) in gcc's builtin.

I will send an email to llvm-dev to discuss if we want to change this or
not.

Diff Detail

Repository: rL LLVM

Event Timeline

filcab updated this revision to Diff 9652. May 21 2014, 12:19 AM

filcab retitled this revision from "" to "Convert some X86 blendv* intrinsics into IR."

filcab updated this object.

filcab edited the test plan for this revision. (Show Details)

filcab added reviewers: grosbach, delena, nadav.

+llvm-commits

Actually add llvm-commits.

Closed by commit rL209643 (authored by @filcab).

Please commit the patch with the fix to the commuted argument order and a fix to the typo in the docs.

Thanks Filipe!

Revision Contents

Path

Size

llvm/

trunk/

lib/

Transforms/

InstCombine/

InstCombineCalls.cpp

35 lines

test/

CodeGen/

X86/

avx-blend.ll

23 lines

avx2-blend.ll

11 lines

sse41-blend.ll

32 lines

Transforms/

InstCombine/

blend_x86.ll

56 lines

Diff 9819

llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp

Show First 20 Lines • Show All 712 Lines • ▼ Show 20 Lines	if (auto CIWidth = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
}		}
}		}
}		}
}		}
}		}
break;		break;
}		}

		case Intrinsic::x86_sse41_pblendvb:
		case Intrinsic::x86_sse41_blendvps:
		case Intrinsic::x86_sse41_blendvpd:
		case Intrinsic::x86_avx_blendv_ps_256:
		case Intrinsic::x86_avx_blendv_pd_256:
		case Intrinsic::x86_avx2_pblendvb: {
		  // Convert blendv* to vector selects if the mask is constant.
		  // This optimization is convoluted because the intrinsic is defined as
		  // getting a vector of floats or doubles for the ps and pd versions.
		  // FIXME: That should be changed.
		  Value *Mask = II->getArgOperand(2);
		  auto *C = dyn_cast<ConstantDataVector>(Mask);
		  // Only a ConstantDataVector mask is folded; anything else keeps the call.
		  if (!C)
		    break;

		  auto *Tyi1 = Builder->getInt1Ty();
		  auto *SelectorType = cast<VectorType>(Mask->getType());
		  auto *EltTy = SelectorType->getElementType();
		  unsigned Size = SelectorType->getNumElements();
		  // Width of one mask element: 32 for float, 64 for double, otherwise the
		  // integer width of the element type (8 for the pblendvb variants).
		  unsigned BitWidth =
		      EltTy->isFloatTy()
		          ? 32
		          : (EltTy->isDoubleTy() ? 64 : EltTy->getIntegerBitWidth());
		  // Parenthesized so the message applies to the whole disjunction rather
		  // than only to the last comparison.
		  assert((BitWidth == 64 || BitWidth == 32 || BitWidth == 8) &&
		         "Wrong arguments for variable blend intrinsic");

		  SmallVector<Constant *, 32> Selectors;
		  for (unsigned I = 0; I < Size; ++I) {
		    // The intrinsics only read the top bit of each mask element, so reduce
		    // every element to that single bit. Floating-point elements are
		    // reinterpreted as their raw bit pattern first.
		    uint64_t Selector;
		    if (BitWidth == 8)
		      Selector = C->getElementAsInteger(I);
		    else
		      Selector = C->getElementAsAPFloat(I).bitcastToAPInt().getZExtValue();
		    Selectors.push_back(ConstantInt::get(Tyi1, Selector >> (BitWidth - 1)));
		  }
		  auto *NewSelector = ConstantVector::get(Selectors);
		  // blendv yields the *second* source operand in lanes whose mask top bit
		  // is set and the first source operand otherwise. A select yields its
		  // first value operand where the condition is true, so the sources must
		  // be passed in swapped order.
		  return SelectInst::Create(NewSelector, II->getArgOperand(1),
		                            II->getArgOperand(0), "blendv");
		}

case Intrinsic::x86_avx_vpermilvar_ps:		case Intrinsic::x86_avx_vpermilvar_ps:
case Intrinsic::x86_avx_vpermilvar_ps_256:		case Intrinsic::x86_avx_vpermilvar_ps_256:
case Intrinsic::x86_avx_vpermilvar_pd:		case Intrinsic::x86_avx_vpermilvar_pd:
case Intrinsic::x86_avx_vpermilvar_pd_256: {		case Intrinsic::x86_avx_vpermilvar_pd_256: {
// Convert vpermil* to shufflevector if the mask is constant.		// Convert vpermil* to shufflevector if the mask is constant.
Value *V = II->getArgOperand(1);		Value *V = II->getArgOperand(1);
unsigned Size = cast<VectorType>(V->getType())->getNumElements();		unsigned Size = cast<VectorType>(V->getType())->getNumElements();
assert(Size == 8 \|\| Size == 4 \|\| Size == 2);		assert(Size == 8 \|\| Size == 4 \|\| Size == 2);
▲ Show 20 Lines • Show All 835 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/avx-blend.ll

	Show First 20 Lines • Show All 129 Lines • ▼ Show 20 Lines
	; CHECK: testb			; CHECK: testb
	define <2 x double> @testb(<2 x double> %x, <2 x double> %y) {			define <2 x double> @testb(<2 x double> %x, <2 x double> %y) {
	; CHECK: vcmpnlepd			; CHECK: vcmpnlepd
	; CHECK: vblendvpd			; CHECK: vblendvpd
	%min_is_x = fcmp ult <2 x double> %x, %y			%min_is_x = fcmp ult <2 x double> %x, %y
	%min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y			%min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y
	ret <2 x double> %min			ret <2 x double> %min
	}			}

				; If we can figure out a blend has a constant mask, we should emit the
				; blend instruction with an immediate mask
				;
				; Codegen test for a <4 x double> select whose <4 x i1> condition is a
				; constant. The directives below expect a vblendpd with no "mov" text
				; appearing between the function label and that instruction.
				define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
				; CHECK-LABEL: constant_blendvpd_avx:
				; CHECK-NOT: mov
				; CHECK: vblendpd
				; CHECK: ret
				; Only lane 2 of the condition is true, so lane 2 comes from %xy and the
				; other three lanes come from %ab.
				%1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %xy, <4 x double> %ab
				ret <4 x double> %1
				}

				; Codegen test for an <8 x float> select with a constant <8 x i1>
				; condition. The directives below expect a vblendps with no "mov" text
				; appearing between the function label and that instruction.
				define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) {
				; CHECK-LABEL: constant_blendvps_avx:
				; CHECK-NOT: mov
				; CHECK: vblendps
				; CHECK: ret
				; Lanes 3 and 7 of the condition are true, so those lanes come from %xyzw
				; and every other lane comes from %abcd.
				%1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <8 x float> %xyzw, <8 x float> %abcd
				ret <8 x float> %1
				}

				declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>)
				declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>)

llvm/trunk/test/CodeGen/X86/avx2-blend.ll

				; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s

				; Codegen test for a <32 x i8> select with a constant <32 x i1> condition.
				; Unlike the ps/pd tests, the directives here expect the variable-mask
				; form: a vmovdqa followed by vpblendvb.
				; NOTE(review): presumably the vmovdqa loads the constant mask that
				; vpblendvb consumes - confirm against the generated assembly.
				define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
				; CHECK-LABEL: constant_pblendvb_avx2:
				; CHECK: vmovdqa
				; CHECK: vpblendvb
				; The condition repeats the 8-lane pattern F,F,T,F,T,T,T,F four times;
				; true lanes come from %xyzw and false lanes from %abcd.
				%1 = select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <32 x i8> %xyzw, <32 x i8> %abcd
				ret <32 x i8> %1
				}

				declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>)

llvm/trunk/test/CodeGen/X86/sse41-blend.ll

	Show First 20 Lines • Show All 82 Lines • ▼ Show 20 Lines
	; CHECK: float_crash			; CHECK: float_crash
	define void @float_crash() nounwind {			define void @float_crash() nounwind {
	entry:			entry:
	%merge205vector_func.i = select <4 x i1> undef, <4 x double> undef, <4 x double> undef			%merge205vector_func.i = select <4 x i1> undef, <4 x double> undef, <4 x double> undef
	%extract214vector_func.i = extractelement <4 x double> %merge205vector_func.i, i32 0			%extract214vector_func.i = extractelement <4 x double> %merge205vector_func.i, i32 0
	store double %extract214vector_func.i, double addrspace(1)* undef, align 8			store double %extract214vector_func.i, double addrspace(1)* undef, align 8
	ret void			ret void
	}			}

				; If we can figure out a blend has a constant mask, we should emit the
				; blend instruction with an immediate mask
				;
				; Codegen test for a <2 x double> select with a constant <2 x i1>
				; condition: lane 0 comes from %xy and lane 1 from %ab.
				define <2 x double> @constant_blendvpd(<2 x double> %xy, <2 x double> %ab) {
				; In this case, we emit a simple movsd
				; CHECK-LABEL: constant_blendvpd
				; CHECK: movsd
				; CHECK: ret
				%1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %xy, <2 x double> %ab
				ret <2 x double> %1
				}

				; Codegen test for a <4 x float> select with a constant <4 x i1>
				; condition. The directives below expect "blendps $7" with no "mov" text
				; appearing between the function label and that instruction.
				define <4 x float> @constant_blendvps(<4 x float> %xyzw, <4 x float> %abcd) {
				; CHECK-LABEL: constant_blendvps
				; CHECK-NOT: mov
				; CHECK: blendps $7
				; CHECK: ret
				; Only lane 3 of the condition is true, so lane 3 comes from %xyzw and
				; lanes 0-2 come from %abcd.
				; NOTE(review): presumably the $7 immediate (bits 0-2) corresponds to the
				; three lanes taken from %abcd - confirm against the blendps encoding.
				%1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %xyzw, <4 x float> %abcd
				ret <4 x float> %1
				}

				; Codegen test for a <16 x i8> select with a constant <16 x i1> condition.
				; The directives below expect the variable-mask form: a movaps followed
				; by pblendvb, then the return.
				; NOTE(review): presumably the movaps loads the constant mask that
				; pblendvb consumes - confirm against the generated assembly.
				define <16 x i8> @constant_pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd) {
				; CHECK-LABEL: constant_pblendvb:
				; CHECK: movaps
				; CHECK: pblendvb
				; CHECK: ret
				; The condition repeats the 8-lane pattern F,F,T,F,T,T,T,F twice; true
				; lanes come from %xyzw and false lanes from %abcd.
				%1 = select <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x i8> %xyzw, <16 x i8> %abcd
				ret <16 x i8> %1
				}
				declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)
				declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
				declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>)

llvm/trunk/test/Transforms/InstCombine/blend_x86.ll

				; RUN: opt < %s -instcombine -mtriple=x86_64-apple-macosx -mcpu=core-avx2 -S | FileCheck %s

				; InstCombine test: a blendvpd call with a constant mask should become a
				; select with a <2 x i1> condition.
				; NOTE(review): the pattern below pins only the i1 condition vector; the
				; order of the select's two value operands is not checked.
				define <2 x double> @constant_blendvpd(<2 x double> %xy, <2 x double> %ab) {
				; CHECK-LABEL: @constant_blendvpd
				; CHECK: select <2 x i1> <i1 true, i1 false>
				; Mask lane 0 (0xFFFFFFFFE0000000) has its top bit set and is expected to
				; map to i1 true; lane 1 (0.0) is expected to map to i1 false.
				%1 = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %xy, <2 x double> %ab, <2 x double> <double 0xFFFFFFFFE0000000, double 0.000000e+00>)
				ret <2 x double> %1
				}

				; InstCombine test: a blendvps call with a constant mask should become a
				; select with a <4 x i1> condition.
				; NOTE(review): the pattern below pins only the i1 condition vector; the
				; order of the select's two value operands is not checked.
				define <4 x float> @constant_blendvps(<4 x float> %xyzw, <4 x float> %abcd) {
				; CHECK-LABEL: @constant_blendvps
				; CHECK: select <4 x i1> <i1 false, i1 false, i1 false, i1 true>
				; Only mask lane 3 is non-zero (float 0xFFFFFFFFE0000000) and it is the
				; only lane expected to map to i1 true.
				; NOTE(review): presumably that float constant has its sign bit set -
				; confirm against the LLVM hexadecimal float-constant syntax.
				%1 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %xyzw, <4 x float> %abcd, <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0xFFFFFFFFE0000000>)
				ret <4 x float> %1
				}

				; InstCombine test: a pblendvb call with a constant byte mask should
				; become a select with a <16 x i1> condition.
				; NOTE(review): the pattern below pins only the i1 condition vector; the
				; order of the select's two value operands is not checked.
				define <16 x i8> @constant_pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd) {
				; CHECK-LABEL: @constant_pblendvb
				; CHECK: select <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>
				; Mask bytes of 255 (top bit set) line up with the i1 true lanes of the
				; expected condition; mask bytes of 0 line up with the i1 false lanes.
				%1 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd, <16 x i8> <i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0>)
				ret <16 x i8> %1
				}

				; InstCombine test: the 256-bit blendv.pd call with a constant mask
				; should become a select with a <4 x i1> condition.
				; NOTE(review): the pattern below pins only the i1 condition vector; the
				; order of the select's two value operands is not checked.
				define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
				; CHECK-LABEL: @constant_blendvpd_avx
				; CHECK: select <4 x i1> <i1 true, i1 false, i1 true, i1 false>
				; Mask lanes 0 and 2 (0xFFFFFFFFE0000000) have their top bit set and are
				; expected to map to i1 true; lanes 1 and 3 (0.0) to i1 false.
				%1 = tail call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %xy, <4 x double> %ab, <4 x double> <double 0xFFFFFFFFE0000000, double 0.000000e+00, double 0xFFFFFFFFE0000000, double 0.000000e+00>)
				ret <4 x double> %1
				}

				; InstCombine test: the 256-bit blendv.ps call with a constant mask
				; should become a select with an <8 x i1> condition.
				; NOTE(review): the pattern below pins only the i1 condition vector; the
				; order of the select's two value operands is not checked.
				define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) {
				; CHECK-LABEL: @constant_blendvps_avx
				; CHECK: select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>
				; Only mask lanes 3 and 7 are non-zero (float 0xFFFFFFFFE0000000) and
				; they are the only lanes expected to map to i1 true.
				%1 = tail call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %xyzw, <8 x float> %abcd, <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0xFFFFFFFFE0000000, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0xFFFFFFFFE0000000>)
				ret <8 x float> %1
				}

				; InstCombine test: the AVX2 pblendvb call with a constant byte mask
				; should become a select with a <32 x i1> condition.
				; NOTE(review): the pattern below pins only the i1 condition vector; the
				; order of the select's two value operands is not checked.
				define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
				; CHECK-LABEL: @constant_pblendvb_avx2
				; CHECK: select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>
				; The mask is written as four rows of eight bytes; each row repeats the
				; pattern 0,0,255,0,255,255,255,0, matching the expected i1 lanes above.
				%1 = tail call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %xyzw, <32 x i8> %abcd,
				<32 x i8> <i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0,
				i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0,
				i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0,
				i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0>)
				ret <32 x i8> %1
				}

				declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)
				declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
				declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>)

				declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>)
				declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>)
				declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>)