This is an archive of the discontinued LLVM Phabricator instance.

[InstCombine] try to reduce x86 addcarry to generic uaddo intrinsic
ClosedPublic

Authored by spatel on Jan 30 2019, 7:35 AM.

Download Raw Diff

Details

Reviewers

RKSimon
craig.topper

Commits

rGbe23a91fcd98: [InstCombine] try to reduce x86 addcarry to generic uaddo intrinsic
rL352870: [InstCombine] try to reduce x86 addcarry to generic uaddo intrinsic

Summary

If we can reduce the x86-specific intrinsic to the generic op, it allows existing simplifications and value tracking folds. AFAICT, this always results in identical x86 codegen in the non-reduced case...which should be true because we semi-generically (too aggressively IMO) convert to llvm.uadd.with.overflow in CGP, so the DAG/isel must already combine/lower this intrinsic as expected.

This isn't quite what was requested in:
https://bugs.llvm.org/show_bug.cgi?id=40486
...but I think we want to have these kinds of folds early for efficiency and to enable greater simplifications. For the case in the bug report where we have:

_addcarry_u64(0, ahi, 0, &ahi)

...this gets completely simplified away in IR.

Diff Detail

Repository: rL LLVM

Event Timeline

spatel created this revision.Jan 30 2019, 7:35 AM

Herald added a subscriber: mcrosier. · View Herald TranscriptJan 30 2019, 7:35 AM

How does this affect the codegen from PR31754 ?

In D57453#1377170, @RKSimon wrote:

How does this affect the codegen from PR31754 ?

This only comes into play for the test code that uses the x86-specific C intrinsic, so it's not changing anything on the original problem examples. And for the test code (https://bugs.llvm.org/show_bug.cgi?id=31754#c3 or the expanded version in PR40486), this is only changing the test<0> instantiation so far.

LGTM

This revision is now accepted and ready to land.Feb 1 2019, 5:45 AM

Closed by commit rL352870: [InstCombine] try to reduce x86 addcarry to generic uaddo intrinsic (authored by spatel). · Explain WhyFeb 1 2019, 6:14 AM

This revision was automatically updated to reflect the committed changes.

Herald added a project: Restricted Project. · View Herald TranscriptFeb 1 2019, 6:14 AM

Revision Contents

Path

Size

llvm/

trunk/

lib/

Transforms/

InstCombine/

InstCombineCalls.cpp

33 lines

test/

Transforms/

InstCombine/

X86/

addcarry.ll

22 lines

Diff 184735

llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp

Show First 20 Lines • Show All 745 Lines • ▼ Show 20 Lines	if (Arg->getType()->isVectorTy() &&
Type *ScalarTy = Type::getIntNTy(Arg->getContext(), NumElts);		Type *ScalarTy = Type::getIntNTy(Arg->getContext(), NumElts);
Value *BC = Builder.CreateBitCast(X, ScalarTy);		Value *BC = Builder.CreateBitCast(X, ScalarTy);
return Builder.CreateZExtOrTrunc(BC, ResTy);		return Builder.CreateZExtOrTrunc(BC, ResTy);
}		}

return nullptr;		return nullptr;
}		}

		/// Try to replace an x86 addcarry intrinsic call with the generic
		/// llvm.uadd.with.overflow intrinsic.
		///
		/// \param II      The x86 addcarry call. Operand 0 is the carry-in; operands 1
		///                and 2 are the two addends, which must share one type.
		/// \param Builder Builder used to create the replacement instructions.
		/// \returns A value of the call's own struct type (i8 carry-out in element 0,
		///          sum in element 1) when the carry-in matches m_ZeroInt(); nullptr
		///          when no simplification applies.
		static Value *simplifyX86addcarry(const IntrinsicInst &II,
		                                  InstCombiner::BuilderTy &Builder) {
		  Value *CarryIn = II.getArgOperand(0);
		  Value *Op1 = II.getArgOperand(1);
		  Value *Op2 = II.getArgOperand(2);
		  Type *RetTy = II.getType();
		  Type *OpTy = Op1->getType();
		  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
		         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
		         "Unexpected types for x86 addcarry");

		  // If carry-in is zero, this is just an unsigned add with overflow.
		  if (match(CarryIn, m_ZeroInt())) {
		    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
		                                          { Op1, Op2 });
		    // The types have to be adjusted to match the x86 call types: the generic
		    // intrinsic yields { sum, i1 overflow }, while the x86 call yields
		    // { i8 carry-out, sum }, so widen the flag and swap the element order.
		    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
		    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
		                                       Builder.getInt8Ty());
		    // Reuse RetTy here rather than calling II.getType() again; this also keeps
		    // RetTy referenced when the assert above is compiled out (NDEBUG), which
		    // avoids an unused-variable warning in release builds.
		    Value *Res = UndefValue::get(RetTy);
		    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
		    return Builder.CreateInsertValue(Res, UAddResult, 1);
		  }

		  return nullptr;
		}

static Value *simplifyX86insertps(const IntrinsicInst &II,		static Value *simplifyX86insertps(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {		InstCombiner::BuilderTy &Builder) {
auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));		auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
if (!CInt)		if (!CInt)
return nullptr;		return nullptr;

VectorType *VecTy = cast<VectorType>(II.getType());		VectorType *VecTy = cast<VectorType>(II.getType());
assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");		assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
▲ Show 20 Lines • Show All 2,342 Lines • ▼ Show 20 Lines
case Intrinsic::x86_avx2_maskstore_d:		case Intrinsic::x86_avx2_maskstore_d:
case Intrinsic::x86_avx2_maskstore_q:		case Intrinsic::x86_avx2_maskstore_q:
case Intrinsic::x86_avx2_maskstore_d_256:		case Intrinsic::x86_avx2_maskstore_d_256:
case Intrinsic::x86_avx2_maskstore_q_256:		case Intrinsic::x86_avx2_maskstore_q_256:
if (simplifyX86MaskedStore(II, this))		if (simplifyX86MaskedStore(II, this))
return nullptr;		return nullptr;
break;		break;

		case Intrinsic::x86_addcarry_32:
		case Intrinsic::x86_addcarry_64:
		if (Value *V = simplifyX86addcarry(*II, Builder))
		return replaceInstUsesWith(*II, V);
		break;

case Intrinsic::ppc_altivec_vperm:		case Intrinsic::ppc_altivec_vperm:
// Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.		// Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
// Note that ppc_altivec_vperm has a big-endian bias, so when creating		// Note that ppc_altivec_vperm has a big-endian bias, so when creating
// a vectorshuffle for little endian, we must undo the transformation		// a vectorshuffle for little endian, we must undo the transformation
// performed on vec_perm in altivec.h. That is, we must complement		// performed on vec_perm in altivec.h. That is, we must complement
// the permutation mask with respect to 31 and reverse the order of		// the permutation mask with respect to 31 and reverse the order of
// V1 and V2.		// V1 and V2.
if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) {		if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) {
▲ Show 20 Lines • Show All 1,540 Lines • Show Last 20 Lines

llvm/trunk/test/Transforms/InstCombine/X86/addcarry.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt < %s -instcombine -S | FileCheck %s			; RUN: opt < %s -instcombine -S | FileCheck %s

	declare { i8, i32 } @llvm.x86.addcarry.32(i8, i32, i32)			declare { i8, i32 } @llvm.x86.addcarry.32(i8, i32, i32)
	declare { i8, i64 } @llvm.x86.addcarry.64(i8, i64, i64)			declare { i8, i64 } @llvm.x86.addcarry.64(i8, i64, i64)

	define i32 @no_carryin_i32(i32 %x, i32 %y, i8* %p) {			define i32 @no_carryin_i32(i32 %x, i32 %y, i8* %p) {
	; CHECK-LABEL: @no_carryin_i32(			; CHECK-LABEL: @no_carryin_i32(
	; CHECK-NEXT: [[S:%.*]] = call { i8, i32 } @llvm.x86.addcarry.32(i8 0, i32 [[X:%.*]], i32 [[Y:%.*]])			; CHECK-NEXT: [[TMP1:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[X:%.*]], i32 [[Y:%.*]])
	; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i32 } [[S]], 0			; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, i1 } [[TMP1]], 0
	; CHECK-NEXT: store i8 [[OV]], i8* [[P:%.*]], align 1			; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1
	; CHECK-NEXT: [[R:%.*]] = extractvalue { i8, i32 } [[S]], 1			; CHECK-NEXT: [[TMP4:%.*]] = zext i1 [[TMP3]] to i8
	; CHECK-NEXT: ret i32 [[R]]			; CHECK-NEXT: store i8 [[TMP4]], i8* [[P:%.*]], align 1
				; CHECK-NEXT: ret i32 [[TMP2]]
	;			;
	%s = call { i8, i32 } @llvm.x86.addcarry.32(i8 0, i32 %x, i32 %y)			%s = call { i8, i32 } @llvm.x86.addcarry.32(i8 0, i32 %x, i32 %y)
	%ov = extractvalue { i8, i32 } %s, 0			%ov = extractvalue { i8, i32 } %s, 0
	store i8 %ov, i8* %p			store i8 %ov, i8* %p
	%r = extractvalue { i8, i32 } %s, 1			%r = extractvalue { i8, i32 } %s, 1
	ret i32 %r			ret i32 %r
	}			}

	define i64 @no_carryin_i64(i64 %x, i64 %y, i8* %p) {			define i64 @no_carryin_i64(i64 %x, i64 %y, i8* %p) {
	; CHECK-LABEL: @no_carryin_i64(			; CHECK-LABEL: @no_carryin_i64(
	; CHECK-NEXT: [[S:%.*]] = call { i8, i64 } @llvm.x86.addcarry.64(i8 0, i64 [[X:%.*]], i64 [[Y:%.*]])			; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
	; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i64 } [[S]], 0			; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
	; CHECK-NEXT: store i8 [[OV]], i8* [[P:%.*]], align 1			; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
	; CHECK-NEXT: [[R:%.*]] = extractvalue { i8, i64 } [[S]], 1			; CHECK-NEXT: [[TMP4:%.*]] = zext i1 [[TMP3]] to i8
	; CHECK-NEXT: ret i64 [[R]]			; CHECK-NEXT: store i8 [[TMP4]], i8* [[P:%.*]], align 1
				; CHECK-NEXT: ret i64 [[TMP2]]
	;			;
	%s = call { i8, i64 } @llvm.x86.addcarry.64(i8 0, i64 %x, i64 %y)			%s = call { i8, i64 } @llvm.x86.addcarry.64(i8 0, i64 %x, i64 %y)
	%ov = extractvalue { i8, i64 } %s, 0			%ov = extractvalue { i8, i64 } %s, 0
	store i8 %ov, i8* %p			store i8 %ov, i8* %p
	%r = extractvalue { i8, i64 } %s, 1			%r = extractvalue { i8, i64 } %s, 1
	ret i64 %r			ret i64 %r
	}			}